// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"

namespace rocksdb {

class DBRangeDelTest : public DBTestBase {
 public:
  DBRangeDelTest() : DBTestBase("/db_range_del_test") {}

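  // Helper that packs `key` into a fixed-width 8-byte string. Several tests
  // below pair these keys with test::Uint64Comparator so that they sort
  // numerically rather than lexicographically.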
  std::string GetNumericStr(int key) {
    uint64_t uint64_key = static_cast<uint64_t>(key);
    std::string str;
    str.resize(8);
    memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
    return str;
  }
};

// PlainTableFactory and NumTableFilesAtLevel() are not supported in
// ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
  Options opts = CurrentOptions();
  opts.table_factory.reset(new PlainTableFactory());
  opts.prefix_extractor.reset(NewNoopTransform());
  opts.allow_mmap_reads = true;
  opts.max_sequential_skip_in_iterations = 999999;
  Reopen(opts);

  ASSERT_TRUE(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", "dr1")
          .IsNotSupported());
}

TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
                             "dr2"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
}

TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.statistics = CreateDBStatistics();
  Reopen(opts);

  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();
  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z");
  db_->Flush(FlushOptions());

  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));
  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                              true /* disallow_trivial_move */);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));
  ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
  // regression test for exactly filled compaction output files. Previously
  // another file would be generated containing all range deletions, which
  // could invalidate the non-overlapping file boundary invariant.
  const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumFiles;
  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  options.num_levels = 2;
  options.target_file_size_base = kFileBytes;
  BlockBasedTableOptions table_options;
  table_options.block_size_deviation = 50;  // each block holds two keys
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  Reopen(options);

  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();
  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1));

  Random rnd(301);
  for (int i = 0; i < kNumFiles; ++i) {
    std::vector<std::string> values;
    // Write 12K (4 values, each 3K)
    for (int j = 0; j < kNumPerFile; j++) {
      values.push_back(RandomString(&rnd, 3 << 10));
      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
      if (j == 0 && i > 0) {
        dbfull()->TEST_WaitForFlushMemTable();
      }
    }
  }
  // put extra key to trigger final flush
  ASSERT_OK(Put("", ""));
  dbfull()->TEST_WaitForFlushMemTable();
  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));

  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                              true /* disallow_trivial_move */);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(2, NumTableFilesAtLevel(1));
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
  // Ensures range deletion spanning multiple compaction output files that are
  // cut by max_compaction_bytes will have non-overlapping key-ranges.
  // https://github.com/facebook/rocksdb/issues/1778
  const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.disable_auto_compactions = true;
  opts.level0_file_num_compaction_trigger = kNumFiles;
  opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  // Want max_compaction_bytes to trigger the end of compaction output file, not
  // target_file_size_base, so make the latter much bigger
  opts.target_file_size_base = 100 * opts.max_compaction_bytes;
  Reopen(opts);

  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();

  // It spans the whole key-range, thus will be included in all output files
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                             GetNumericStr(0),
                             GetNumericStr(kNumFiles * kNumPerFile - 1)));
  Random rnd(301);
  for (int i = 0; i < kNumFiles; ++i) {
    std::vector<std::string> values;
    // Write 1MB (256 values, each 4K)
    for (int j = 0; j < kNumPerFile; j++) {
      values.push_back(RandomString(&rnd, kBytesPerVal));
      ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
    }
    // extra entry to trigger SpecialSkipListFactory's flush
    ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
  }

  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                              true /* disallow_trivial_move */);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_GE(NumTableFilesAtLevel(1), 2);

  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);

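  // Adjacent L1 output files must have non-overlapping key ranges: each file's
  // largest key must sort strictly before the next file's smallest key.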
  for (size_t i = 0; i < files[1].size() - 1; ++i) {
    ASSERT_TRUE(InternalKeyComparator(opts.comparator)
                    .Compare(files[1][i].largest, files[1][i + 1].smallest) <
                0);
  }
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
  // Regression test for bug where sentinel range deletions (i.e., ones with
  // sequence number of zero) were included in output files.
  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();

  // gaps between ranges create sentinels in our internal representation
  std::vector<std::pair<std::string, std::string>> range_dels = {
      {"a", "b"}, {"c", "d"}, {"e", "f"}};
  for (const auto& range_del : range_dels) {
    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                               range_del.first, range_del.second));
  }
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(1, NumTableFilesAtLevel(0));

  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
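  // A smallest_seqno of zero would indicate a sentinel tombstone (sequence
  // number zero) made it into the output file.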
  ASSERT_GT(files[0][0].smallest_seqno, 0);

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
  db_->Put(WriteOptions(), "b1", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
  db_->Put(WriteOptions(), "b2", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
  // first iteration verifies query correctness in memtable, second verifies
  // query correctness for a single SST file
  for (int i = 0; i < 2; ++i) {
    if (i > 0) {
      ASSERT_OK(db_->Flush(FlushOptions()));
      ASSERT_EQ(1, NumTableFilesAtLevel(0));
    }
    std::string value;
    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
    ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
  }
}

TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
  db_->Put(WriteOptions(), "unused", "val");  // prevents empty after compaction
  db_->Put(WriteOptions(), "b1", "val");
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(3, NumTableFilesAtLevel(0));

  for (int i = 0; i < 2; ++i) {
    if (i > 0) {
      dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                                  true /* disallow_trivial_move */);
      ASSERT_EQ(0, NumTableFilesAtLevel(0));
      ASSERT_EQ(1, NumTableFilesAtLevel(1));
    }
    std::string value;
    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
  }
}
#endif  // ROCKSDB_LITE

TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
  const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  Reopen(opts);

  // Write a third before snapshot, a third between snapshot and tombstone, and
  // a third after the tombstone. Keys older than snapshot or newer than the
  // tombstone should be preserved.
  const Snapshot* snapshot = nullptr;
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 3) {
      snapshot = db_->GetSnapshot();
    } else if (i == 2 * kNum / 3) {
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
  }
  db_->Flush(FlushOptions());

  for (int i = 0; i < kNum; ++i) {
    ReadOptions read_opts;
    read_opts.ignore_range_deletions = true;
    std::string value;
    if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
      ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
    } else {
      ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
    }
  }
  db_->ReleaseSnapshot(snapshot);
}

// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
  const int kNumPerFile = 100, kNumFiles = 4;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.disable_auto_compactions = true;
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  opts.num_levels = 2;
  opts.statistics = CreateDBStatistics();
  Reopen(opts);

  for (int i = 0; i < kNumFiles; ++i) {
    if (i > 0) {
      // range tombstone covers first half of the previous file
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr((i - 1) * kNumPerFile),
                       GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2));
    }
    // Make sure a given key appears in each file so compaction won't be able to
    // use trivial move, which would happen if the ranges were non-overlapping.
    // Also, we need an extra element since flush is only triggered when the
    // number of keys is one greater than SpecialSkipListFactory's limit.
    // We choose a key outside the key-range used by the test to avoid conflict.
    db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val");

    for (int j = 0; j < kNumPerFile; ++j) {
      db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val");
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
  }
  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_GT(NumTableFilesAtLevel(1), 0);
  ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
            TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));

  for (int i = 0; i < kNumFiles; ++i) {
    for (int j = 0; j < kNumPerFile; ++j) {
      ReadOptions read_opts;
      read_opts.ignore_range_deletions = true;
      std::string value;
      if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
        ASSERT_OK(
            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
      } else {
        ASSERT_TRUE(
            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
                .IsNotFound());
      }
    }
  }
}

TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
  const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumFiles;
  options.max_bytes_for_level_base = 2 * kFileBytes;
  options.max_subcompactions = 4;
  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  options.num_levels = 3;
  options.target_file_size_base = kFileBytes;
  options.target_file_size_multiplier = 1;
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < kNumFiles; ++j) {
      if (i > 0) {
        // delete [95,105) in two files, [295,305) in next two
        int mid = (j + (1 - j % 2)) * kNumPerFile;
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                         Key(mid - 5), Key(mid + 5));
      }
      std::vector<std::string> values;
      // Write 100KB (100 values, each 1K)
      for (int k = 0; k < kNumPerFile; k++) {
        values.push_back(RandomString(&rnd, 990));
        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
      }
      // put extra key to trigger flush
      ASSERT_OK(Put("", ""));
      dbfull()->TEST_WaitForFlushMemTable();
      if (j < kNumFiles - 1) {
        // background compaction may happen early for kNumFiles'th file
        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
      }
      if (j == options.level0_file_num_compaction_trigger - 1) {
        // When i == 1, compaction will output some files to L1, at which point
        // L1 is not bottommost so range deletions cannot be compacted away. The
        // new L1 files must be generated with non-overlapping key ranges even
        // though multiple subcompactions see the same ranges deleted, else an
        // assertion will fail.
        //
        // Only enable auto-compactions when we're ready; otherwise, the
        // oversized L0 (relative to base_level) causes the compaction to run
        // earlier.
        ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
        dbfull()->TEST_WaitForCompact();
        ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                                  {{"disable_auto_compactions", "true"}}));
        ASSERT_EQ(NumTableFilesAtLevel(0), 0);
        ASSERT_GT(NumTableFilesAtLevel(1), 0);
        ASSERT_GT(NumTableFilesAtLevel(2), 0);
      }
    }
  }
}

TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
  const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
  Options options = CurrentOptions();
  options.compaction_options_universal.min_merge_width = kFilesPerLevel;
  options.compaction_options_universal.max_merge_width = kFilesPerLevel;
  options.compaction_options_universal.size_ratio = 10;
  options.compaction_style = kCompactionStyleUniversal;
  options.level0_file_num_compaction_trigger = kFilesPerLevel;
  options.max_subcompactions = 4;
  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  options.num_levels = kNumLevels;
  options.target_file_size_base = kNumPerFile << 10;
  options.target_file_size_multiplier = 1;
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < kNumLevels - 1; ++i) {
    for (int j = 0; j < kFilesPerLevel; ++j) {
      if (i == kNumLevels - 2) {
        // insert range deletions [95,105) in two files, [295,305) in next two
        // to prepare L1 for later manual compaction.
        int mid = (j + (1 - j % 2)) * kNumPerFile;
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                         Key(mid - 5), Key(mid + 5));
      }
      std::vector<std::string> values;
      // Write 100KB (100 values, each 1K)
      for (int k = 0; k < kNumPerFile; k++) {
        values.push_back(RandomString(&rnd, 990));
        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
      }
      // put extra key to trigger flush
      ASSERT_OK(Put("", ""));
      dbfull()->TEST_WaitForFlushMemTable();
      if (j < kFilesPerLevel - 1) {
        // background compaction may happen early for kFilesPerLevel'th file
        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
      }
    }
    dbfull()->TEST_WaitForCompact();
    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
  }
  // Now that L1-L3 are full, when we compact L1->L2 we should see (1)
  // subcompactions happen since the input level > 0, and (2) range deletions
  // not dropped since the output level is not bottommost. If no file boundary
  // assertion fails, universal compaction + subcompaction + range deletion are
  // likely compatible.
  ASSERT_OK(dbfull()->RunManualCompaction(
      reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
          ->cfd(),
      1 /* input_level */, 2 /* output_level */, 0 /* output_path_id */,
      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
      true /* disallow_trivial_move */));
}
#endif  // ROCKSDB_LITE

TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
  const int kNumPerFile = 3, kNumFiles = 3;
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile));
  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
  opts.num_levels = 2;
  Reopen(opts);

  // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
  // requires an extra entry.
  for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
    if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
      // Delete merge operands from all but the last file
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
                       "key_");
    }
    std::string val;
    PutFixed64(&val, i);
    db_->Merge(WriteOptions(), "key", val);
    // we need to prevent trivial move using Puts so compaction will actually
    // process the merge operands.
    db_->Put(WriteOptions(), "prevent_trivial_move", "");
    if (i > 0 && i % kNumPerFile == 0) {
      dbfull()->TEST_WaitForFlushMemTable();
    }
  }

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  std::string expected, actual;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 45);  // 1+2+...+9
  ASSERT_EQ(expected, actual);

  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);

  expected.clear();
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  uint64_t tmp;
  Slice tmp2(actual);
  GetFixed64(&tmp2, &tmp);
  PutFixed64(&expected, 30);  // 6+7+8+9 (earlier operands covered by tombstone)
  ASSERT_EQ(expected, actual);
}

// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
  // During compaction to bottommost level, verify range tombstones older than
  // the oldest snapshot are removed, while others are preserved.
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.num_levels = 2;
  opts.statistics = CreateDBStatistics();
  Reopen(opts);

  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
                   "dr1");  // obsolete after compaction
  db_->Put(WriteOptions(), "key", "val");
  db_->Flush(FlushOptions());
  const Snapshot* snapshot = db_->GetSnapshot();
  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
                   "dr2");  // protected by snapshot
  db_->Put(WriteOptions(), "key", "val");
  db_->Flush(FlushOptions());

  ASSERT_EQ(2, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));
  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));
  ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
  // The RangeDelAggregator holds pointers into range deletion blocks created by
  // table readers. This test ensures the aggregator can still access those
  // blocks even if it outlives the table readers that created them.
  //
  // DBIter always keeps readers open for L0 files. So, in order to test
  // aggregator outliving reader, we need to have deletions in L1 files, which
  // are opened/closed on-demand during the scan. This is accomplished by
  // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
  // from all lingering in L0 (there is at most one range deletion per L0 file).
  //
  // The first L1 file will contain a range deletion since its begin key is 0.
  // SeekToFirst() references that table's reader and adds its range tombstone
  // to the aggregator. Upon advancing beyond that table's key-range via Next(),
  // the table reader will be unreferenced by the iterator. Since we manually
  // call Evict() on all readers before the full scan, this unreference causes
  // the reader's refcount to drop to zero and thus be destroyed.
  //
  // When it is destroyed, we do not remove its range deletions from the
  // aggregator. So, subsequent calls to Next() must be able to use these
  // deletions to decide whether a key is covered. This will work as long as
  // the aggregator properly references the range deletion block.
  const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.level0_file_num_compaction_trigger = 4;
  opts.level0_stop_writes_trigger = 4;
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  opts.num_levels = 2;
  BlockBasedTableOptions bbto;
  bbto.cache_index_and_filter_blocks = true;
  bbto.block_cache = NewLRUCache(8 << 20);
  opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(opts);

  // Hold a snapshot so range deletions can't become obsolete during compaction
  // to bottommost level (i.e., L1).
  const Snapshot* snapshot = db_->GetSnapshot();
  for (int i = 0; i < kNum; ++i) {
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
    if (i > 0) {
      dbfull()->TEST_WaitForFlushMemTable();
    }
    if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
  }
  // Must be > 1 so the first L1 file can be closed before scan finishes
  dbfull()->TEST_WaitForCompact();
  ASSERT_GT(NumTableFilesAtLevel(1), 1);
  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);

  ReadOptions read_opts;
  auto* iter = db_->NewIterator(read_opts);
  int expected = kRangeEnd;
  iter->SeekToFirst();
  for (auto file_number : file_numbers) {
    // This puts table caches in the state of being externally referenced only
    // so they are destroyed immediately upon iterator unreferencing.
    TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
  }
  for (; iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
    ++expected;
    // Keep clearing block cache's LRU so range deletion block can be freed as
    // soon as its refcount drops to zero.
    bbto.block_cache->EraseUnRefEntries();
  }
  ASSERT_EQ(kNum, expected);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
  db_->Put(WriteOptions(), "key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  std::string value;
  ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 3;
  opts.min_write_buffer_number_to_merge = 2;
  // SpecialSkipListFactory lets us specify maximum number of elements the
  // memtable can hold. It switches the active memtable to immutable (flush is
  // prevented by the above options) upon inserting an element that would
  // overflow the memtable.
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  Reopen(opts);

  db_->Put(WriteOptions(), "key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  db_->Put(WriteOptions(), "blah", "val");

  ReadOptions read_opts;
  std::string value;
  ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
  db_->Put(WriteOptions(), "key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));

  ReadOptions read_opts;
  std::string value;
  ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
  const int kNumMergeOps = 10;
  Options opts = CurrentOptions();
  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
  Reopen(opts);

  for (int i = 0; i < kNumMergeOps; ++i) {
    std::string val;
    PutFixed64(&val, i);
    db_->Merge(WriteOptions(), "key", val);
    if (i == kNumMergeOps / 2) {
      // deletes [0, 5]
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
                       "key_");
    }
  }

  ReadOptions read_opts;
  std::string expected, actual;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 30);  // 6+7+8+9
  ASSERT_EQ(expected, actual);

  expected.clear();
  read_opts.ignore_range_deletions = true;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 45);  // 0+1+2+...+9
  ASSERT_EQ(expected, actual);
}

TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 4;
  opts.min_write_buffer_number_to_merge = 3;
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  Reopen(opts);

  db_->Put(WriteOptions(), "sst_key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  db_->Put(WriteOptions(), "imm_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  db_->Put(WriteOptions(), "mem_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
    std::string value;
    ASSERT_OK(db_->Get(read_opts, key, &value));
  }
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  Reopen(opts);

  // Write half of the keys before the tombstone and half after the tombstone.
  // Only covered keys (i.e., within the range and older than the tombstone)
  // should be deleted.
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 2) {
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
  }
  ReadOptions read_opts;
  auto* iter = db_->NewIterator(read_opts);

  int expected = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
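    // Keys [kRangeBegin, kNum / 2) were written before the tombstone and are
    // covered by it, so the iterator jumps from key 49 directly to key 100.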
    if (expected == kRangeBegin - 1) {
      expected = kNum / 2;
    } else {
      ++expected;
    }
  }
  ASSERT_EQ(kNum, expected);
  delete iter;
}

TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  Reopen(opts);

  const Snapshot* snapshot = nullptr;
  // Put a snapshot before the range tombstone, verify an iterator using that
  // snapshot sees all inserted keys.
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 2) {
      snapshot = db_->GetSnapshot();
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
  }
  ReadOptions read_opts;
  read_opts.snapshot = snapshot;
  auto* iter = db_->NewIterator(read_opts);

  int expected = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
    ++expected;
  }
  ASSERT_EQ(kNum / 2, expected);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 4;
  opts.min_write_buffer_number_to_merge = 3;
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  Reopen(opts);

  db_->Put(WriteOptions(), "sst_key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  db_->Put(WriteOptions(), "imm_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  db_->Put(WriteOptions(), "mem_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  auto* iter = db_->NewIterator(read_opts);
  int i = 0;
  std::string expected[] = {"imm_key", "mem_key", "sst_key"};
  for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
    std::string key;
    ASSERT_EQ(expected[i], iter->key());
  }
  ASSERT_EQ(3, i);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
  db_->Put(WriteOptions(), "key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  // iterations check unsupported in memtable, l0, and then l1
  for (int i = 0; i < 3; ++i) {
    ReadOptions read_opts;
    read_opts.tailing = true;
    auto* iter = db_->NewIterator(read_opts);
    if (i == 2) {
      // For L1+, iterators over files are created on-demand, so need seek
      iter->SeekToFirst();
    }
    ASSERT_TRUE(iter->status().IsNotSupported());
    delete iter;
    if (i == 0) {
      ASSERT_OK(db_->Flush(FlushOptions()));
    } else if (i == 1) {
      MoveFilesToLevel(1);
    }
  }
  db_->ReleaseSnapshot(snapshot);
}
#endif  // ROCKSDB_LITE

}  // namespace rocksdb

int main(int argc, char** argv) {
  rocksdb::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}