// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "db/db_test_util.h"
#include "db/version_set.h"
#include "port/stack_trace.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "test_util/testutil.h"
#include "util/random.h"
#include "utilities/merge_operators.h"

namespace ROCKSDB_NAMESPACE {

// TODO(cbi): parameterize the test to cover user-defined timestamp cases
class DBRangeDelTest : public DBTestBase {
 public:
  DBRangeDelTest() : DBTestBase("db_range_del_test", /*env_do_fsync=*/false) {}

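  // Encodes `key` as the raw 8-byte binary representation of a uint64_t.
  // Tests that use these keys also set test::Uint64Comparator() so keys
  // compare numerically rather than byte-wise.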
  std::string GetNumericStr(int key) {
    uint64_t uint64_key = static_cast<uint64_t>(key);
    std::string str;
    str.resize(8);
    memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
    return str;
  }
};

// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
// supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
  // TODO: figure out why MmapReads trips the iterator pinning assertion in
  // RangeDelAggregator. Ideally it would be supported; otherwise it should at
  // least be explicitly unsupported.
  for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
    option_config_ = config;
    DestroyAndReopen(CurrentOptions());
    ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 "dr1", "dr1")
                    .IsNotSupported());
  }
}

TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
  WriteBatchWithIndex indexedBatch{};
  ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
                  .IsNotSupported());
  ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
}

TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) {
  ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "b"));
  ASSERT_EQ("val", Get("b"));
}
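
// DeleteRange covers the half-open interval [begin, end). A minimal
// illustrative sketch (not part of the tests):
//
//   db->Put(WriteOptions(), "a", "val");
//   db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "b");
//   // "a" is now covered; "b" is not, since the end key is exclusive.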

TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) {
  ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
  ASSERT_TRUE(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a")
          .IsInvalidArgument());
  ASSERT_EQ("val", Get("b"));
}

TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
  do {
    DestroyAndReopen(CurrentOptions());
    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                               "dr1", "dr2"));
    ASSERT_OK(db_->Flush(FlushOptions()));
    ASSERT_EQ(1, NumTableFilesAtLevel(0));
  } while (ChangeOptions(kRangeDelSkipConfigs));
}

TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) {
  Options opts = CurrentOptions();
  opts.compression_opts.max_dict_bytes = 16384;
  Reopen(opts);
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
                             "dr2"));
  ASSERT_OK(db_->Flush(FlushOptions()));
}

TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
  do {
    Options opts = CurrentOptions();
    opts.disable_auto_compactions = true;
    opts.statistics = CreateDBStatistics();
    DestroyAndReopen(opts);

    // snapshot protects the range tombstone from being dropped as obsolete.
    const Snapshot* snapshot = db_->GetSnapshot();
    ASSERT_OK(
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
    ASSERT_OK(db_->Flush(FlushOptions()));

    ASSERT_EQ(1, NumTableFilesAtLevel(0));
    ASSERT_EQ(0, NumTableFilesAtLevel(1));
    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                                          true /* disallow_trivial_move */));
    ASSERT_EQ(0, NumTableFilesAtLevel(0));
    ASSERT_EQ(1, NumTableFilesAtLevel(1));
    ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
    db_->ReleaseSnapshot(snapshot);
    // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
    // compactions as the above assertions about the number of files in a level
    // do not hold true.
  } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
                         kSkipFIFOCompaction));
}

TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
  // regression test for exactly filled compaction output files. Previously
  // another file would be generated containing all range deletions, which
  // could invalidate the non-overlapping file boundary invariant.
  const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumFiles;
  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
  options.num_levels = 2;
  options.target_file_size_base = kFileBytes;
  BlockBasedTableOptions table_options;
  table_options.block_size_deviation = 50;  // each block holds two keys
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  Reopen(options);

  // snapshot protects the range tombstone from being dropped as obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
                             Key(1)));

  Random rnd(301);
  for (int i = 0; i < kNumFiles; ++i) {
    std::vector<std::string> values;
    // Write 12K (4 values, each 3K)
    for (int j = 0; j < kNumPerFile; j++) {
      values.push_back(rnd.RandomString(3 << 10));
      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
      if (j == 0 && i > 0) {
        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
      }
    }
  }
  // put extra key to trigger final flush
  ASSERT_OK(Put("", ""));
  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));

  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                                        true /* disallow_trivial_move */));
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(2, NumTableFilesAtLevel(1));
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
  // Ensures range deletion spanning multiple compaction output files that are
  // cut by max_compaction_bytes will have non-overlapping key-ranges.
  // https://github.com/facebook/rocksdb/issues/1778
  const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.disable_auto_compactions = true;
  opts.level0_file_num_compaction_trigger = kNumFiles;
  opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
  // Want max_compaction_bytes to trigger the end of compaction output files,
  // not target_file_size_base. Start with a tiny target_file_size_base so the
  // setup files stay small, then raise it to 100 * max_compaction_bytes via
  // SetOptions() below, right before the compaction under test.
  opts.target_file_size_base = 1;
  DestroyAndReopen(opts);

  // snapshot protects the range tombstone from being dropped as obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();

  Random rnd(301);

  ASSERT_OK(Put(GetNumericStr(0), rnd.RandomString(kBytesPerVal)));
  ASSERT_OK(
      Put(GetNumericStr(kNumPerFile - 1), rnd.RandomString(kBytesPerVal)));
  ASSERT_OK(Flush());
  ASSERT_OK(Put(GetNumericStr(kNumPerFile), rnd.RandomString(kBytesPerVal)));
  ASSERT_OK(
      Put(GetNumericStr(kNumPerFile * 2 - 1), rnd.RandomString(kBytesPerVal)));
  ASSERT_OK(Flush());
  MoveFilesToLevel(2);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(NumTableFilesAtLevel(2), 2);

  ASSERT_OK(
      db_->SetOptions(db_->DefaultColumnFamily(),
                      {{"target_file_size_base",
                        std::to_string(100 * opts.max_compaction_bytes)}}));
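  // From here on, target_file_size_base (100 * max_compaction_bytes) dwarfs
  // max_compaction_bytes (1MB), so max_compaction_bytes is what cuts the
  // compaction output files below.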

  // It spans the whole key-range, thus will be included in all output files
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                             GetNumericStr(0),
                             GetNumericStr(kNumFiles * kNumPerFile - 1)));

  for (int i = 0; i < kNumFiles; ++i) {
    std::vector<std::string> values;
    // Write 1MB (256 values, each 4K)
    for (int j = 0; j < kNumPerFile; j++) {
      values.push_back(rnd.RandomString(kBytesPerVal));
      ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
    }
    // extra entry to trigger SpecialSkipListFactory's flush
    ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
  }

  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr,
                                        /*column_family=*/nullptr,
                                        /*disallow_trivial_move=*/true));
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_GE(NumTableFilesAtLevel(1), 2);
  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);

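  // Verify the invariant from the issue above: adjacent L1 output files must
  // have non-overlapping key ranges, i.e., each file's largest key must sort
  // strictly below the next file's smallest key.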
  for (size_t i = 0; i + 1 < files[1].size(); ++i) {
    ASSERT_TRUE(InternalKeyComparator(opts.comparator)
                    .Compare(files[1][i].largest, files[1][i + 1].smallest) <
                0);
  }
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
  // Regression test for bug where sentinel range deletions (i.e., ones with
  // sequence number of zero) were included in output files.
  // snapshot protects the range tombstone from being dropped as obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();

  // gaps between ranges create sentinels in our internal representation
  std::vector<std::pair<std::string, std::string>> range_dels = {
      {"a", "b"}, {"c", "d"}, {"e", "f"}};
  for (const auto& range_del : range_dels) {
    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                               range_del.first, range_del.second));
  }
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(1, NumTableFilesAtLevel(0));

  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
  ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
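  // A sentinel tombstone would carry sequence number zero, so a positive
  // smallest_seqno proves no sentinel made it into the output file.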

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
  ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
  ASSERT_OK(db_->Put(WriteOptions(), "b2", "val"));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
  // first iteration verifies query correctness in memtable, second verifies
  // query correctness for a single SST file
  for (int i = 0; i < 2; ++i) {
    if (i > 0) {
      ASSERT_OK(db_->Flush(FlushOptions()));
      ASSERT_EQ(1, NumTableFilesAtLevel(0));
    }
    std::string value;
    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
    ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
  }
}

TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
  ASSERT_OK(db_->Put(WriteOptions(), "unused",
                     "val"));  // prevents empty after compaction
  ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(3, NumTableFilesAtLevel(0));

  for (int i = 0; i < 2; ++i) {
    if (i > 0) {
      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                                            true /* disallow_trivial_move */));
      ASSERT_EQ(0, NumTableFilesAtLevel(0));
      ASSERT_EQ(1, NumTableFilesAtLevel(1));
    }
    std::string value;
    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
  }
}
#endif  // ROCKSDB_LITE

TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
  const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  DestroyAndReopen(opts);

  // Write a third before snapshot, a third between snapshot and tombstone, and
  // a third after the tombstone. Keys older than snapshot or newer than the
  // tombstone should be preserved.
  const Snapshot* snapshot = nullptr;
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 3) {
      snapshot = db_->GetSnapshot();
    } else if (i == 2 * kNum / 3) {
      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 GetNumericStr(kRangeBegin),
                                 GetNumericStr(kRangeEnd)));
    }
    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
  }
  ASSERT_OK(db_->Flush(FlushOptions()));

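  // The reads below ignore range deletions, so they see exactly what the flush
  // physically kept: keys outside the tombstone's range, keys protected by the
  // snapshot, and keys written after the tombstone.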
  for (int i = 0; i < kNum; ++i) {
    ReadOptions read_opts;
    read_opts.ignore_range_deletions = true;
    std::string value;
    if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
      ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
    } else {
      ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
    }
  }
  db_->ReleaseSnapshot(snapshot);
}

// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
  const int kNumPerFile = 100, kNumFiles = 4;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.disable_auto_compactions = true;
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
  opts.num_levels = 2;
  opts.statistics = CreateDBStatistics();
  DestroyAndReopen(opts);

  for (int i = 0; i < kNumFiles; ++i) {
    if (i > 0) {
      // range tombstone covers first half of the previous file
      ASSERT_OK(db_->DeleteRange(
          WriteOptions(), db_->DefaultColumnFamily(),
          GetNumericStr((i - 1) * kNumPerFile),
          GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)));
    }
    // Make sure a given key appears in each file so compaction won't be able to
    // use trivial move, which would happen if the ranges were non-overlapping.
    // Also, we need an extra element since flush is only triggered when the
    // number of keys is one greater than SpecialSkipListFactory's limit.
    // We choose a key outside the key-range used by the test to avoid conflict.
    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles),
                       "val"));

    for (int j = 0; j < kNumPerFile; ++j) {
      ASSERT_OK(
          db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"));
    }
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
  }
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_GT(NumTableFilesAtLevel(1), 0);
  ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
            TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));

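  // Three tombstones each covered half of a 100-key file, so the ticker above
  // counts (kNumFiles - 1) * kNumPerFile / 2 = 150 dropped keys. As in the
  // flush test, read with ignore_range_deletions to confirm covered keys are
  // physically gone from the compaction output.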
  for (int i = 0; i < kNumFiles; ++i) {
    for (int j = 0; j < kNumPerFile; ++j) {
      ReadOptions read_opts;
      read_opts.ignore_range_deletions = true;
      std::string value;
      if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
        ASSERT_OK(
            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
      } else {
        ASSERT_TRUE(
            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
                .IsNotFound());
      }
    }
  }
}

TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
  const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumFiles;
  options.max_bytes_for_level_base = 2 * kFileBytes;
  options.max_subcompactions = 4;
  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
  options.num_levels = 3;
  options.target_file_size_base = kFileBytes;
  options.target_file_size_multiplier = 1;
  options.max_compaction_bytes = 1500;
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < kNumFiles; ++j) {
      if (i > 0) {
        // delete [95,105) in two files, [295,305) in next two
        int mid = (j + (1 - j % 2)) * kNumPerFile;
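        // For j = 0,1 this yields mid = 100; for j = 2,3, mid = 300, so the
        // tombstones are [95,105) and [295,305), matching the comment above.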
        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                   Key(mid - 5), Key(mid + 5)));
      }
      std::vector<std::string> values;
      // Write 100KB (100 values, each 1K)
      for (int k = 0; k < kNumPerFile; k++) {
        values.push_back(rnd.RandomString(990));
        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
      }
      // put extra key to trigger flush
      ASSERT_OK(Put("", ""));
      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
      if (j < kNumFiles - 1) {
        // background compaction may happen early for kNumFiles'th file
        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
      }
      if (j == options.level0_file_num_compaction_trigger - 1) {
        // When i == 1, compaction will output some files to L1, at which point
        // L1 is not bottommost so range deletions cannot be compacted away. The
        // new L1 files must be generated with non-overlapping key ranges even
        // though multiple subcompactions see the same ranges deleted, else an
        // assertion will fail.
        //
        // Only enable auto-compactions when we're ready; otherwise, the
        // oversized L0 (relative to base_level) causes the compaction to run
        // earlier.
        ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
        ASSERT_OK(dbfull()->TEST_WaitForCompact());
        ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                                  {{"disable_auto_compactions", "true"}}));
        ASSERT_EQ(NumTableFilesAtLevel(0), 0);
        ASSERT_GT(NumTableFilesAtLevel(1), 0);
        ASSERT_GT(NumTableFilesAtLevel(2), 0);
      }
    }
  }
}

TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
  const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
  Options options = CurrentOptions();
  options.compaction_options_universal.min_merge_width = kFilesPerLevel;
  options.compaction_options_universal.max_merge_width = kFilesPerLevel;
  options.compaction_options_universal.size_ratio = 10;
  options.compaction_style = kCompactionStyleUniversal;
  options.level0_file_num_compaction_trigger = kFilesPerLevel;
  options.max_subcompactions = 4;
  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
  options.num_levels = kNumLevels;
  options.target_file_size_base = kNumPerFile << 10;
  options.target_file_size_multiplier = 1;
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < kNumLevels - 1; ++i) {
    for (int j = 0; j < kFilesPerLevel; ++j) {
      if (i == kNumLevels - 2) {
        // insert range deletions [95,105) in two files, [295,305) in next two
        // to prepare L1 for later manual compaction.
        int mid = (j + (1 - j % 2)) * kNumPerFile;
        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                   Key(mid - 5), Key(mid + 5)));
      }
      std::vector<std::string> values;
      // Write 100KB (100 values, each 1K)
      for (int k = 0; k < kNumPerFile; k++) {
        values.push_back(rnd.RandomString(990));
        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
      }
      // put extra key to trigger flush
      ASSERT_OK(Put("", ""));
      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
      if (j < kFilesPerLevel - 1) {
        // background compaction may happen early for kFilesPerLevel'th file
        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
      }
    }
    ASSERT_OK(dbfull()->TEST_WaitForCompact());
    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
  }
  // Now L1-L3 are full. When we compact L1->L2 we should see (1) subcompactions
  // happen since the input level > 0; (2) range deletions are not dropped since
  // the output level is not bottommost. If no file boundary assertion fails,
  // that probably means universal compaction + subcompaction + range deletion
  // are compatible.
  ASSERT_OK(dbfull()->RunManualCompaction(
      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
          ->cfd(),
      1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
      true /* disallow_trivial_move */,
      std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
      "" /*trim_ts*/));
}
#endif  // ROCKSDB_LITE

TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
  const int kNumPerFile = 3, kNumFiles = 3;
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(2 * kNumPerFile));
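  // Each loop iteration below writes two entries (a Merge and a Put), so the
  // memtable reaches its 2 * kNumPerFile limit once per kNumPerFile
  // iterations.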
  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
  opts.num_levels = 2;
  Reopen(opts);

  // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
  // requires an extra entry.
  for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
    if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
      // Delete merge operands from all but the last file
      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 "key", "key_"));
    }
    std::string val;
    PutFixed64(&val, i);
    ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
    // We use Puts to prevent trivial moves so compaction will actually process
    // the merge operands.
    ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", ""));
    if (i > 0 && i % kNumPerFile == 0) {
      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
    }
  }

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  std::string expected, actual;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 45);  // 0+1+2+...+9
  ASSERT_EQ(expected, actual);

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));

  expected.clear();
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  uint64_t tmp;
  Slice tmp2(actual);
  GetFixed64(&tmp2, &tmp);
  PutFixed64(&expected, 30);  // 6+7+8+9 (earlier operands covered by tombstone)
  ASSERT_EQ(expected, actual);
}

TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
  // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
  // Flush. The `CompactionIterator` previously had a bug where we forgot to
  // check for covering range tombstones when processing the (1) Put, causing
  // it to reappear after the flush.
  Options opts = CurrentOptions();
  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
  Reopen(opts);

  std::string val;
  PutFixed64(&val, 1);
  ASSERT_OK(db_->Put(WriteOptions(), "key", val));
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
                             "key_"));
  ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
  ASSERT_OK(db_->Flush(FlushOptions()));

  ReadOptions read_opts;
  std::string expected, actual;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 1);
  ASSERT_EQ(expected, actual);
}

// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
  // During compaction to bottommost level, verify range tombstones older than
  // the oldest snapshot are removed, while others are preserved.
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.num_levels = 2;
  opts.statistics = CreateDBStatistics();
  Reopen(opts);

  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
                             "dr10"));  // obsolete after compaction
  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
                             "dr20"));  // protected by snapshot
  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
  ASSERT_OK(db_->Flush(FlushOptions()));

  ASSERT_EQ(2, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));
  ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
  // The RangeDelAggregator holds pointers into range deletion blocks created by
  // table readers. This test ensures the aggregator can still access those
  // blocks even if it outlives the table readers that created them.
  //
  // DBIter always keeps readers open for L0 files. So, in order to test the
  // aggregator outliving its reader, we need deletions in L1 files, which are
  // opened/closed on-demand during the scan. This is accomplished by
  // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
  // from all lingering in L0 (there is at most one range deletion per L0 file).
  //
  // The first L1 file will contain a range deletion since its begin key is 0.
  // SeekToFirst() references that table's reader and adds its range tombstone
  // to the aggregator. Upon advancing beyond that table's key-range via Next(),
  // the table reader will be unreferenced by the iterator. Since we manually
  // call Evict() on all readers before the full scan, this unreference causes
  // the reader's refcount to drop to zero and thus be destroyed.
  //
  // When it is destroyed, we do not remove its range deletions from the
  // aggregator. So, subsequent calls to Next() must be able to use these
  // deletions to decide whether a key is covered. This will work as long as
  // the aggregator properly references the range deletion block.
  const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.level0_file_num_compaction_trigger = 4;
  opts.level0_stop_writes_trigger = 4;
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.num_levels = 2;
  BlockBasedTableOptions bbto;
  bbto.cache_index_and_filter_blocks = true;
  bbto.block_cache = NewLRUCache(8 << 20);
  opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(opts);

  // Hold a snapshot so range deletions can't become obsolete during compaction
  // to bottommost level (i.e., L1).
  const Snapshot* snapshot = db_->GetSnapshot();
  for (int i = 0; i < kNum; ++i) {
    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
    if (i > 0) {
      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
    }
    if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 GetNumericStr(kRangeBegin),
                                 GetNumericStr(kRangeEnd)));
    }
  }
  // Must be > 1 so the first L1 file can be closed before scan finishes
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_GT(NumTableFilesAtLevel(1), 1);
  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);

  ReadOptions read_opts;
  auto* iter = db_->NewIterator(read_opts);
  ASSERT_OK(iter->status());
  int expected = kRangeEnd;
  iter->SeekToFirst();
  for (auto file_number : file_numbers) {
    // This puts table caches in the state of being externally referenced only
    // so they are destroyed immediately upon iterator unreferencing.
    TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
  }
  for (; iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
    ++expected;
    // Keep clearing block cache's LRU so range deletion block can be freed as
    // soon as its refcount drops to zero.
    bbto.block_cache->EraseUnRefEntries();
  }
  ASSERT_EQ(kNum, expected);
  delete iter;
  db_->ReleaseSnapshot(snapshot);

  // Also test proper cache handling in GetRangeTombstoneIterator, via
  // TablesRangeTombstoneSummary. (This once triggered a memory leak report
  // with ASAN.)
  opts.max_open_files = 1;
  Reopen(opts);

  std::string str;
  ASSERT_OK(dbfull()->TablesRangeTombstoneSummary(db_->DefaultColumnFamily(),
                                                  100, &str));
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
  do {
    DestroyAndReopen(CurrentOptions());
    ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
    ASSERT_OK(
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

    ReadOptions read_opts;
    std::string value;
    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
  } while (ChangeOptions(kRangeDelSkipConfigs));
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
  do {
    Options opts = CurrentOptions();
    opts.max_write_buffer_number = 3;
    opts.min_write_buffer_number_to_merge = 2;
    // SpecialSkipListFactory lets us specify maximum number of elements the
    // memtable can hold. It switches the active memtable to immutable (flush is
    // prevented by the above options) upon inserting an element that would
    // overflow the memtable.
    opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
    DestroyAndReopen(opts);

    ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
    ASSERT_OK(
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
    ASSERT_OK(db_->Put(WriteOptions(), "blah", "val"));
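    // The trailing Put overflows the single-entry memtable, so the range
    // tombstone now lives in an immutable memtable; the Get below verifies
    // tombstones in immutable memtables are respected.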

    ReadOptions read_opts;
    std::string value;
    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
  } while (ChangeOptions(kRangeDelSkipConfigs));
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
  do {
    DestroyAndReopen(CurrentOptions());
    ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
    // snapshot prevents key from being deleted during flush
    const Snapshot* snapshot = db_->GetSnapshot();
    ASSERT_OK(
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
    ASSERT_OK(db_->Flush(FlushOptions()));

    ReadOptions read_opts;
    std::string value;
    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
    db_->ReleaseSnapshot(snapshot);
  } while (ChangeOptions(kRangeDelSkipConfigs));
}

TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
  const int kNumMergeOps = 10;
  Options opts = CurrentOptions();
  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
  Reopen(opts);

  for (int i = 0; i < kNumMergeOps; ++i) {
    std::string val;
    PutFixed64(&val, i);
    ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
    if (i == kNumMergeOps / 2) {
      // deletes [0, 5]
      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 "key", "key_"));
    }
  }

  ReadOptions read_opts;
  std::string expected, actual;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 30);  // 6+7+8+9
  ASSERT_EQ(expected, actual);

  expected.clear();
  read_opts.ignore_range_deletions = true;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 45);  // 0+1+2+...+9
  ASSERT_EQ(expected, actual);
}

TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 4;
  opts.min_write_buffer_number_to_merge = 3;
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  Reopen(opts);

  ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
    std::string value;
    ASSERT_OK(db_->Get(read_opts, key, &value));
  }
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
  DestroyAndReopen(opts);

  // Write half of the keys before the tombstone and half after the tombstone.
  // Only covered keys (i.e., within the range and older than the tombstone)
  // should be deleted.
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 2) {
      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 GetNumericStr(kRangeBegin),
                                 GetNumericStr(kRangeEnd)));
    }
    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
  }
  ReadOptions read_opts;
  auto* iter = db_->NewIterator(read_opts);
  ASSERT_OK(iter->status());

  int expected = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
    if (expected == kRangeBegin - 1) {
      expected = kNum / 2;
    } else {
      ++expected;
    }
  }
  ASSERT_EQ(kNum, expected);
  delete iter;
}

TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
  DestroyAndReopen(opts);

  const Snapshot* snapshot = nullptr;
  // Put a snapshot before the range tombstone, verify an iterator using that
  // snapshot sees all inserted keys.
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 2) {
      snapshot = db_->GetSnapshot();
      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 GetNumericStr(kRangeBegin),
                                 GetNumericStr(kRangeEnd)));
    }
    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
  }
  ReadOptions read_opts;
  read_opts.snapshot = snapshot;
  auto* iter = db_->NewIterator(read_opts);
  ASSERT_OK(iter->status());

  int expected = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
    ++expected;
  }
  ASSERT_EQ(kNum / 2, expected);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 4;
  opts.min_write_buffer_number_to_merge = 3;
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  Reopen(opts);

  ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  auto* iter = db_->NewIterator(read_opts);
  ASSERT_OK(iter->status());
  int i = 0;
  std::string expected[] = {"imm_key", "mem_key", "sst_key"};
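  // Keys come back in lexicographic order ("imm_key" < "mem_key" < "sst_key")
  // regardless of whether they reside in the active memtable, an immutable
  // memtable, or an SST file.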
  for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
    std::string key;
    ASSERT_EQ(expected[i], iter->key());
  }
  ASSERT_EQ(3, i);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

#ifndef ROCKSDB_UBSAN_RUN
TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  // iterations check unsupported in memtable, L0, and then L1
  for (int i = 0; i < 3; ++i) {
    ReadOptions read_opts;
    read_opts.tailing = true;
    auto* iter = db_->NewIterator(read_opts);
    if (i == 2) {
      // For L1+, iterators over files are created on-demand, so need seek
      iter->SeekToFirst();
    }
    ASSERT_TRUE(iter->status().IsNotSupported());

    delete iter;
    if (i == 0) {
      ASSERT_OK(db_->Flush(FlushOptions()));
    } else if (i == 1) {
      MoveFilesToLevel(1);
    }
  }
  db_->ReleaseSnapshot(snapshot);
}
#endif  // !ROCKSDB_UBSAN_RUN

TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) {
  const int kNumFiles = 2, kNumKeysPerFile = 4;
  Options options = CurrentOptions();
  options.compression = kNoCompression;
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumFiles;
  options.max_subcompactions = 2;
  options.num_levels = 2;
  options.target_file_size_base = 4096;
  Reopen(options);

  // need an L1 file for subcompaction to be triggered
  ASSERT_OK(
      db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  MoveFilesToLevel(1);

  // put enough keys to fill up the first subcompaction, and later range-delete
  // them so that the first subcompaction outputs no key-values. In that case
  // it'll consider making an SST file dedicated to range deletions.
  for (int i = 0; i < kNumKeysPerFile; ++i) {
    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
                       std::string(1024, 'a')));
  }
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
                             Key(kNumKeysPerFile)));

  // the above range tombstone can be dropped, so that one alone won't cause a
  // dedicated file to be opened. We can make one protected by snapshot that
  // must be considered. Make its range outside the first subcompaction's range
  // to exercise the tricky part of the code.
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                             Key(kNumKeysPerFile + 1),
                             Key(kNumKeysPerFile + 2)));
  ASSERT_OK(db_->Flush(FlushOptions()));

  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));

  ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, MemtableBloomFilter) {
  // regression test for #2743. The range delete tombstones in memtable should
  // be added even when Get() skips searching due to its prefix bloom filter
  const int kMemtableSize = 1 << 20;              // 1MB
  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
  const int kNumKeys = 1000;
  const int kPrefixLen = 8;
  Options options = CurrentOptions();
  options.memtable_prefix_bloom_size_ratio =
      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
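  // 8KB / 1MB = 0.0078125, i.e., the prefix bloom filter gets roughly 0.8% of
  // the memtable's space.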
  options.prefix_extractor.reset(
      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
  options.write_buffer_size = kMemtableSize;
  Reopen(options);

  for (int i = 0; i < kNumKeys; ++i) {
    ASSERT_OK(Put(Key(i), "val"));
  }
  ASSERT_OK(Flush());
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
                             Key(kNumKeys)));
  for (int i = 0; i < kNumKeys; ++i) {
    std::string value;
    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
  }
}

TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
  // This test originally verified that compaction treated files containing a
  // split range deletion in the input level as an atomic unit. I.e.,
  // compacting any input-level file(s) containing a portion of the range
  // deletion causes all other input-level files containing portions of that
  // same range deletion to be included in the compaction. Range deletion
  // tombstones are now truncated to sstable boundaries which removed the need
  // for that behavior (which could lead to excessively large
  // compactions).
  const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
  Options options = CurrentOptions();
  options.compression = kNoCompression;
  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
  options.memtable_factory.reset(
      test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
  // max file size could be 2x of target file size, so set it to half of that
  options.target_file_size_base = kValueBytes / 2;
  // disable dynamic_file_size, as it will cut L1 files into more files (than
  // kNumFilesPerLevel).
  options.level_compaction_dynamic_file_size = false;
  options.max_compaction_bytes = 1500;
  // i == 0: CompactFiles
  // i == 1: CompactRange
  // i == 2: automatic compaction
  for (int i = 0; i < 3; ++i) {
    DestroyAndReopen(options);

    ASSERT_OK(Put(Key(0), ""));
    ASSERT_OK(db_->Flush(FlushOptions()));
    MoveFilesToLevel(2);
    ASSERT_EQ(1, NumTableFilesAtLevel(2));

    // snapshot protects the range tombstone from being dropped as obsolete.
    const Snapshot* snapshot = db_->GetSnapshot();
    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                               Key(0), Key(2 * kNumFilesPerLevel)));

    Random rnd(301);
    std::string value = rnd.RandomString(kValueBytes);
    for (int j = 0; j < kNumFilesPerLevel; ++j) {
      // give files overlapping key-ranges to prevent trivial move
      ASSERT_OK(Put(Key(j), value));
      ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
      if (j > 0) {
        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
        ASSERT_EQ(j, NumTableFilesAtLevel(0));
      }
    }
    // put extra key to trigger final flush
    ASSERT_OK(Put("", ""));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
    ASSERT_OK(dbfull()->TEST_WaitForCompact());
    ASSERT_EQ(0, NumTableFilesAtLevel(0));
    ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1));

    ColumnFamilyMetaData meta;
    db_->GetColumnFamilyMetaData(&meta);
    if (i == 0) {
      ASSERT_OK(db_->CompactFiles(
          CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
      ASSERT_EQ(0, NumTableFilesAtLevel(1));
    } else if (i == 1) {
      auto begin_str = Key(0), end_str = Key(1);
      Slice begin = begin_str, end = end_str;
      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
      ASSERT_EQ(3, NumTableFilesAtLevel(1));
    } else if (i == 2) {
      ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                                {{"max_bytes_for_level_base", "10000"}}));
      ASSERT_OK(dbfull()->TEST_WaitForCompact());
      ASSERT_EQ(1, NumTableFilesAtLevel(1));
    }
    ASSERT_GT(NumTableFilesAtLevel(2), 0);

    db_->ReleaseSnapshot(snapshot);
  }
}

TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
  // Test the handling of the range-tombstone end-key as the
  // upper-bound for an sstable.

  const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
  Options options = CurrentOptions();
  options.compression = kNoCompression;
  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
  options.memtable_factory.reset(
      test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
  options.target_file_size_base = kValueBytes;
  options.disable_auto_compactions = true;
  // disable it for now, otherwise the L1 files are going to be cut before data
  // 1:
  // L1: [0] [1,4]
  // L2: [0,0]
  // because the grandparent file is between [0]->[1] and its size is more than
  // 1/8 of the target size (4k).
  options.level_compaction_dynamic_file_size = false;

  DestroyAndReopen(options);

  // Create an initial sstable at L2:
  //   [key000000#1,1, key000000#1,1]
  ASSERT_OK(Put(Key(0), ""));
  ASSERT_OK(db_->Flush(FlushOptions()));
  MoveFilesToLevel(2);
  ASSERT_EQ(1, NumTableFilesAtLevel(2));

  // A snapshot protects the range tombstone from being dropped due to
  // becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
                             Key(2 * kNumFilesPerLevel)));

  // Create 2 additional sstables in L0. Note that the first sstable
  // contains the range tombstone.
  //   [key000000#3,1, key000004#72057594037927935,15]
  //   [key000001#5,1, key000002#6,1]
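  // (In this notation, 72057594037927935 is kMaxSequenceNumber, i.e. 2^56 - 1,
  // and the trailing ",15" is the value type kTypeRangeDeletion; together they
  // mark a truncated range tombstone boundary.)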
  Random rnd(301);
  std::string value = rnd.RandomString(kValueBytes);
  for (int j = 0; j < kNumFilesPerLevel; ++j) {
    // Give files overlapping key-ranges to prevent a trivial move when we
    // compact from L0 to L1.
    ASSERT_OK(Put(Key(j), value));
    ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
    ASSERT_OK(db_->Flush(FlushOptions()));
    ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
  }
  // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
  // are 2 sstables generated in L1 due to the target_file_size_base setting.
  // L1:
  //   [key000000#3,1, key000002#72057594037927935,15]
  //   [key000002#6,1, key000004#72057594037927935,15]
  // L2:
  //   [key000000#1,1, key000000#1,1]
  MoveFilesToLevel(1);
  ASSERT_EQ(2, NumTableFilesAtLevel(1));

  {
    // Compact the second sstable in L1:
    // L1:
    //   [key000000#3,1, key000002#72057594037927935,15]
    // L2:
    //   [key000000#1,1, key000000#1,1]
    //   [key000002#6,1, key000004#72057594037927935,15]
    //
    // At the same time, verify the compaction does not cause the key at the
    // endpoint (key000002#6,1) to disappear.
    ASSERT_EQ(value, Get(Key(2)));
    auto begin_str = Key(3);
    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
    ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
    ASSERT_EQ(1, NumTableFilesAtLevel(1));
    ASSERT_EQ(2, NumTableFilesAtLevel(2));
    ASSERT_EQ(value, Get(Key(2)));
  }

  {
    // Compact the first sstable in L1. This should be copacetic, but
    // was previously resulting in overlapping sstables in L2 due to
    // mishandling of the range tombstone end-key when used as the
    // largest key for an sstable. The resulting LSM structure should
    // be:
    //
    // L2:
    //   [key000000#1,1, key000001#72057594037927935,15]
    //   [key000001#5,1, key000002#72057594037927935,15]
    //   [key000002#6,1, key000004#72057594037927935,15]
    auto begin_str = Key(0);
    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
    ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin));
    ASSERT_EQ(0, NumTableFilesAtLevel(1));
    ASSERT_EQ(3, NumTableFilesAtLevel(2));
  }

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, UnorderedTombstones) {
  // Regression test for #2752. Range delete tombstones between
  // different snapshot stripes are not stored in order, so the first
  // tombstone of each snapshot stripe should be checked as a smallest
  // candidate.
  Options options = CurrentOptions();
  DestroyAndReopen(options);

  auto cf = db_->DefaultColumnFamily();

  ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
  ASSERT_OK(db_->Flush(FlushOptions(), cf));
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));

  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
  // Hold a snapshot to separate these two delete ranges.
  auto snapshot = db_->GetSnapshot();
  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
  ASSERT_OK(db_->Flush(FlushOptions(), cf));
  db_->ReleaseSnapshot(snapshot);

  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(cf, &files);
  ASSERT_EQ(1, files[0].size());
  ASSERT_EQ("a", files[0][0].smallest.user_key());
  ASSERT_EQ("c", files[0][0].largest.user_key());

  std::string v;
  auto s = db_->Get(ReadOptions(), "a", &v);
  ASSERT_TRUE(s.IsNotFound());
}

class MockMergeOperator : public MergeOperator {
  // Mock non-associative operator. Non-associativity is expressed by lack of
  // implementation for any `PartialMerge*` functions.
 public:
  bool FullMergeV2(const MergeOperationInput& merge_in,
                   MergeOperationOutput* merge_out) const override {
    assert(merge_out != nullptr);
    merge_out->new_value = merge_in.operand_list.back().ToString();
    return true;
  }

  const char* Name() const override { return "MockMergeOperator"; }
};
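
// Because MockMergeOperator has no PartialMerge, compaction cannot combine
// adjacent operands; each operand is written out individually. That is what
// lets the tests below produce adjacent output files whose endpoints share a
// user key.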

TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
  // This test uses a non-associative merge operator since that is a convenient
  // way to get compaction to write out files with overlapping user-keys at the
  // endpoints. Note, however, overlapping endpoints can also occur with other
  // value types (Put, etc.), assuming the right snapshots are present.
  const int kFileBytes = 1 << 20;
  const int kValueBytes = 1 << 10;
  const int kNumFiles = 4;

  Options options = CurrentOptions();
  options.compression = kNoCompression;
  options.disable_auto_compactions = true;
  options.merge_operator.reset(new MockMergeOperator());
  options.target_file_size_base = kFileBytes;
  Reopen(options);

  // Push dummy data to L3 so that our actual test files on L0-L2 will not be
  // considered the "bottommost" level; otherwise compaction may prevent us
  // from creating overlapping user keys, since on the bottommost level
  // MergeHelper combines all merge operands for a key.
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  MoveFilesToLevel(3);

  Random rnd(301);
  const Snapshot* snapshot = nullptr;
  for (int i = 0; i < kNumFiles; ++i) {
    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
      auto value = rnd.RandomString(kValueBytes);
      ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
    }
    if (i == kNumFiles - 1) {
      // Take snapshot to prevent covered merge operands from being dropped by
      // compaction.
      snapshot = db_->GetSnapshot();
      // The DeleteRange is the last write so all merge operands are covered.
      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                 "key", "key_"));
    }
    ASSERT_OK(db_->Flush(FlushOptions()));
  }
  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
  std::string value;
  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());

  ASSERT_OK(dbfull()->TEST_CompactRange(
      0 /* level */, nullptr /* begin */, nullptr /* end */,
      nullptr /* column_family */, true /* disallow_trivial_move */));
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  // Now we have multiple files at L1 all containing a single user key, thus
  // guaranteeing overlap in the file endpoints.
  ASSERT_GT(NumTableFilesAtLevel(1), 1);

  // Verify no merge operands reappeared after the compaction.
  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());

  // Compact and verify again. It's worthwhile because now the files have
  // tighter endpoints, so we can verify that doesn't mess anything up.
  ASSERT_OK(dbfull()->TEST_CompactRange(
      1 /* level */, nullptr /* begin */, nullptr /* end */,
      nullptr /* column_family */, true /* disallow_trivial_move */));
  ASSERT_GT(NumTableFilesAtLevel(2), 1);
  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
  // Verify a key newer than a range tombstone cannot be deleted by being
  // compacted to the bottom level (and thus having its seqnum zeroed) before
  // the range tombstone. This used to happen when range tombstones were
  // untruncated on reads such that they extended past their file boundaries.
  //
  // Test summary:
  //
  // - L1 is bottommost.
  // - A couple snapshots are strategically taken to prevent seqnums from being
  //   zeroed, range tombstone from being dropped, merge operands from being
  //   dropped, and merge operands from being combined.
  // - Left half of files in L1 all have same user key, ensuring their file
  //   boundaries overlap. In the past this would cause range tombstones to be
  //   untruncated.
  // - Right half of L1 files all have different keys, ensuring no overlap.
  // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
  // - Keys in the right side of the key-range are overwritten. These are
  //   compacted down to L1 after releasing snapshots such that their seqnums
  //   will be zeroed.
  // - A full range scan is performed. If the tombstone in the left L1 files
  //   were untruncated, it would now cover keys newer than it (but with zeroed
  //   seqnums) in the right L1 files.
  const int kFileBytes = 1 << 20;
  const int kValueBytes = 1 << 10;
  const int kNumFiles = 4;
  const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
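  // kMaxKey = 4 * (1 << 20) / (1 << 10) = 4096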
1337 const int kKeysOverwritten = 10;
1338
1339 Options options = CurrentOptions();
1340 options.compression = kNoCompression;
1341 options.disable_auto_compactions = true;
1342 options.merge_operator.reset(new MockMergeOperator());
1343 options.num_levels = 2;
1344 options.target_file_size_base = kFileBytes;
1345 Reopen(options);
1346
1347 Random rnd(301);
1348 // - snapshots[0] prevents merge operands from being combined during
1349 // compaction.
1350 // - snapshots[1] prevents merge operands from being dropped due to the
1351 // covering range tombstone.
1352 const Snapshot* snapshots[] = {nullptr, nullptr};
1353 for (int i = 0; i < kNumFiles; ++i) {
1354 for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
1355 auto value = rnd.RandomString(kValueBytes);
1356 std::string key;
1357 if (i < kNumFiles / 2) {
1358 key = Key(0);
1359 } else {
1360 key = Key(1 + i * kFileBytes / kValueBytes + j);
1361 }
1362 ASSERT_OK(db_->Merge(WriteOptions(), key, value));
1363 }
1364 if (i == 0) {
1365 snapshots[0] = db_->GetSnapshot();
1366 }
1367 if (i == kNumFiles - 1) {
1368 snapshots[1] = db_->GetSnapshot();
1369 // The DeleteRange is the last write so all merge operands are covered.
1370 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
1371 Key(0), Key(kMaxKey + 1)));
1372 }
1373 ASSERT_OK(db_->Flush(FlushOptions()));
1374 }
1375 ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
1376
1377 auto get_key_count = [this]() -> int {
1378 auto* iter = db_->NewIterator(ReadOptions());
1379 assert(iter->status().ok());
1380 iter->SeekToFirst();
1381 int keys_found = 0;
1382 for (; iter->Valid(); iter->Next()) {
1383 ++keys_found;
1384 }
1385 delete iter;
1386 return keys_found;
1387 };
1388
1389 // All keys should be covered
1390 ASSERT_EQ(0, get_key_count());
1391
1392 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
1393 nullptr /* end_key */));
1394 ASSERT_EQ(0, NumTableFilesAtLevel(0));
1395 // Roughly the left half of L1 files should have overlapping boundary keys,
1396 // while the right half should not.
1397 ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
1398
1399 // Now overwrite a few keys that are in L1 files that definitely don't have
1400 // overlapping boundary keys.
1401 for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
1402 auto value = rnd.RandomString(kValueBytes);
1403 ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
1404 }
1405 ASSERT_OK(db_->Flush(FlushOptions()));
1406
1407 // The overwritten keys are in L0 now, so they clearly aren't covered by the
1408 // range tombstone in L1.
1409 ASSERT_EQ(kKeysOverwritten, get_key_count());
1410
1411 // Release the snapshots so seqnums can be zeroed when the L0->L1 compaction happens.
1412 db_->ReleaseSnapshot(snapshots[0]);
1413 db_->ReleaseSnapshot(snapshots[1]);
1414
1415 auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1);
1416 auto end_key_storage = Key(kMaxKey);
1417 Slice begin_key(begin_key_storage);
1418 Slice end_key(end_key_storage);
1419 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key));
1420 ASSERT_EQ(0, NumTableFilesAtLevel(0));
1421 ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
1422
1423 ASSERT_EQ(kKeysOverwritten, get_key_count());
1424 }
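
// The interaction between DeleteRange and later merge operands can be seen in
// a much smaller setting. The sketch below is illustrative and not part of
// the original suite; it assumes the string-append merge operator from
// utilities/merge_operators.h. A tombstone only covers writes with smaller
// seqnums, so an operand written after it survives.
TEST_F(DBRangeDelTest, MergeOperandAfterDeleteRangeSurvivesSketch) {
  Options options = CurrentOptions();
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  DestroyAndReopen(options);
  ASSERT_OK(db_->Merge(WriteOptions(), Key(0), "old"));
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
                             Key(10)));
  ASSERT_OK(db_->Merge(WriteOptions(), Key(0), "new"));
  // The tombstone acts as the merge base, so only the newer operand remains.
  ASSERT_EQ("new", Get(Key(0)));
}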
1425
1426 TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) {
1427 // Exposes a bug where `RangeDelPositioningMode::kBackwardTraversal` was used
1428 // while scanning merge operands in the forward direction. Confusingly, this
1429 // case happened during `DBIter::Prev`. It could cause an assertion failure or
1430 // reappearing keys.
1431 const int kFileBytes = 1 << 20;
1432 const int kValueBytes = 1 << 10;
1433 // Need multiple keys so we can get results when calling `Prev()` after
1434 // `SeekToLast()`.
1435 const int kNumKeys = 3;
1436 const int kNumFiles = 4;
1437
1438 Options options = CurrentOptions();
1439 options.compression = kNoCompression;
1440 options.disable_auto_compactions = true;
1441 options.merge_operator.reset(new MockMergeOperator());
1442 options.target_file_size_base = kFileBytes;
1443 Reopen(options);
1444
1445 Random rnd(301);
1446 const Snapshot* snapshot = nullptr;
1447 for (int i = 0; i < kNumFiles; ++i) {
1448 for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
1449 auto value = rnd.RandomString(kValueBytes);
1450 ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value));
1451 if (i == 0 && j == kNumKeys) {
1452 // Take snapshot to prevent covered merge operands from being dropped or
1453 // merged by compaction.
1454 snapshot = db_->GetSnapshot();
1455 // Do a DeleteRange near the beginning so only the oldest merge operand
1456 // for each key is covered. This ensures the sequence of events:
1457 //
1458 // - `DBIter::Prev()` is called
1459 // - After several versions of the same user key are encountered, it
1460 // decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`.
1461 // - It binary searches to the newest version of the key, which is in the
1462 // leftmost file containing the user key.
1463 // - It scans forward to collect all merge operands. Eventually it reaches
1464 // the rightmost file containing the oldest merge operand, which
1465 // should be covered by the `DeleteRange`. If `RangeDelAggregator`
1466 // were not properly using `kForwardTraversal` here, that operand
1467 // would reappear.
1468 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
1469 Key(0), Key(kNumKeys + 1)));
1470 }
1471 }
1472 ASSERT_OK(db_->Flush(FlushOptions()));
1473 }
1474 ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
1475
1476 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
1477 nullptr /* end_key */));
1478 ASSERT_EQ(0, NumTableFilesAtLevel(0));
1479 ASSERT_GT(NumTableFilesAtLevel(1), 1);
1480
1481 auto* iter = db_->NewIterator(ReadOptions());
1482 ASSERT_OK(iter->status());
1483 iter->SeekToLast();
1484 int keys_found = 0;
1485 for (; iter->Valid(); iter->Prev()) {
1486 ++keys_found;
1487 }
1488 delete iter;
1489 ASSERT_EQ(kNumKeys, keys_found);
1490
1491 db_->ReleaseSnapshot(snapshot);
1492 }
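
// Note on the modes referenced above: RangeDelPositioningMode tells the
// RangeDelAggregator which direction the point iterator is moving, so
// tombstone coverage checks can advance through the fragmented tombstones in
// one pass instead of repositioning from scratch each time. Passing
// kBackwardTraversal during a forward operand scan is what broke the
// invariant exercised by this test.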
1493
1494 TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
1495 const int kFileBytes = 1 << 20;
1496
1497 Options options = CurrentOptions();
1498 options.compression = kNoCompression;
1499 options.disable_auto_compactions = true;
1500 options.target_file_size_base = kFileBytes;
1501 Reopen(options);
1502
1503 ASSERT_OK(Put(Key(0), "a"));
1504 const Snapshot* snapshot = db_->GetSnapshot();
1505
1506 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
1507 Key(10)));
1508
1509 ASSERT_OK(db_->Flush(FlushOptions()));
1510
1511 ReadOptions read_opts;
1512 read_opts.snapshot = snapshot;
1513 auto* iter = db_->NewIterator(read_opts);
1514 ASSERT_OK(iter->status());
1515
1516 iter->SeekToFirst();
1517 ASSERT_TRUE(iter->Valid());
1518 ASSERT_EQ(Key(0), iter->key());
1519
1520 iter->Next();
1521 ASSERT_FALSE(iter->Valid());
1522
1523 delete iter;
1524 db_->ReleaseSnapshot(snapshot);
1525 }
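
// A companion sketch (not part of the original suite): the same snapshot
// visibility rule applies to point lookups, not just iterators. A key covered
// by a DeleteRange issued after the snapshot remains readable through that
// snapshot.
TEST_F(DBRangeDelTest, SnapshotPointLookupSeesCoveredKeySketch) {
  DestroyAndReopen(CurrentOptions());
  ASSERT_OK(Put(Key(0), "a"));
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
                             Key(10)));
  ReadOptions read_opts;
  read_opts.snapshot = snapshot;
  std::string value;
  // Read through the snapshot: the pre-tombstone value is visible.
  ASSERT_OK(db_->Get(read_opts, Key(0), &value));
  ASSERT_EQ("a", value);
  // Read at the latest sequence number: the key is gone.
  ASSERT_TRUE(db_->Get(ReadOptions(), Key(0), &value).IsNotFound());
  db_->ReleaseSnapshot(snapshot);
}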
1526
1527 TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
1528 const int kFileBytes = 1 << 20;
1529
1530 Options options = CurrentOptions();
1531 options.compression = kNoCompression;
1532 options.disable_auto_compactions = true;
1533 options.target_file_size_base = kFileBytes;
1534 Reopen(options);
1535
1536 // Block the flush thread so the immutable memtable stays pinned in memory.
1537 SyncPoint::GetInstance()->DisableProcessing();
1538 SyncPoint::GetInstance()->LoadDependency({
1539 {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
1540 "DBImpl::BGWorkFlush"},
1541 });
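  // The dependency above orders two sync points across threads: the flush
  // thread's "DBImpl::BGWorkFlush" waits until this test thread has passed
  // "...:AfterNewIterator". That keeps the switched-out memtable immutable
  // but unflushed while the iterator below reads from it.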
1542 SyncPoint::GetInstance()->EnableProcessing();
1543
1544 ASSERT_OK(Put(Key(0), "a"));
1545 std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
1546 snapshot(db_->GetSnapshot(),
1547 [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
1548
1549 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
1550 Key(10)));
1551
1552 ASSERT_OK(dbfull()->TEST_SwitchMemtable());
1553
1554 ReadOptions read_opts;
1555 read_opts.snapshot = snapshot.get();
1556 std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
1557 ASSERT_OK(iter->status());
1558
1559 TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
1560
1561 iter->SeekToFirst();
1562 ASSERT_TRUE(iter->Valid());
1563 ASSERT_EQ(Key(0), iter->key());
1564
1565 iter->Next();
1566 ASSERT_FALSE(iter->Valid());
1567 }
1568
1569 TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
1570 // Adapted from
1571 // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
1572 // Regression test for issue where range tombstone was written to more files
1573 // than necessary when it began exactly at the begin key in the next
1574 // compaction output file.
1575 const int kFileBytes = 1 << 20;
1576 const int kValueBytes = 4 << 10;
1577 Options options = CurrentOptions();
1578 options.compression = kNoCompression;
1579 options.disable_auto_compactions = true;
1580 // Allow a bit of slack in the size limits, as we enforce them more strictly
1581 // when manually flushing/compacting.
1582 options.max_compaction_bytes = 2 * kFileBytes;
1583 options.target_file_size_base = 2 * kFileBytes;
1584 options.write_buffer_size = 2 * kFileBytes;
1585 Reopen(options);
1586
1587 Random rnd(301);
1588 for (char first_char : {'a', 'b', 'c'}) {
1589 for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
1590 std::string key(1, first_char);
1591 key.append(Key(i));
1592 std::string value = rnd.RandomString(kValueBytes);
1593 ASSERT_OK(Put(key, value));
1594 }
1595 ASSERT_OK(db_->Flush(FlushOptions()));
1596 MoveFilesToLevel(2);
1597 }
1598 ASSERT_EQ(0, NumTableFilesAtLevel(0));
1599 ASSERT_EQ(3, NumTableFilesAtLevel(2));
1600
1601 // Populate the memtable lightly while spanning the whole key-space. The
1602 // setting of `max_compaction_bytes` will cause the L0->L1 compaction to
1603 // output multiple files to prevent a large L1->L2 compaction later.
1604 ASSERT_OK(Put("a", "val"));
1605 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
1606 "c" + Key(1), "d"));
1607 // Our compaction output file cutting logic currently only considers point
1608 // keys. So, in order for the range tombstone to have a chance at landing at
1609 // the start of a new file, we need a point key at the range tombstone's
1610 // start.
1611 // TODO(ajkr): remove this `Put` after file cutting accounts for range
1612 // tombstones (#3977).
1613 ASSERT_OK(Put("c" + Key(1), "value"));
1614 ASSERT_OK(db_->Flush(FlushOptions()));
1615
1616 // Ensure the manual L0->L1 compaction cuts the outputs before the range
1617 // tombstone and places the tombstone only in the second SST.
1618 std::string begin_key_storage("c" + Key(1));
1619 Slice begin_key(begin_key_storage);
1620 std::string end_key_storage("d");
1621 Slice end_key(end_key_storage);
1622 ASSERT_OK(dbfull()->TEST_CompactRange(
1623 0 /* level */, &begin_key /* begin */, &end_key /* end */,
1624 nullptr /* column_family */, true /* disallow_trivial_move */));
1625 ASSERT_EQ(2, NumTableFilesAtLevel(1));
1626
1627 std::vector<LiveFileMetaData> all_metadata;
1628 std::vector<LiveFileMetaData> l1_metadata;
1629 db_->GetLiveFilesMetaData(&all_metadata);
1630 for (const auto& metadata : all_metadata) {
1631 if (metadata.level == 1) {
1632 l1_metadata.push_back(metadata);
1633 }
1634 }
1635 std::sort(l1_metadata.begin(), l1_metadata.end(),
1636 [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
1637 return options.comparator->Compare(a.smallestkey, b.smallestkey) <
1638 0;
1639 });
1640 ASSERT_EQ("a", l1_metadata[0].smallestkey);
1641 ASSERT_EQ("a", l1_metadata[0].largestkey);
1642 ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
1643 ASSERT_EQ("d", l1_metadata[1].largestkey);
1644
1645 TablePropertiesCollection all_table_props;
1646 ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
1647 int64_t num_range_deletions = 0;
1648 for (const auto& name_and_table_props : all_table_props) {
1649 const auto& name = name_and_table_props.first;
1650 const auto& table_props = name_and_table_props.second;
1651 // The range tombstone should only be output to the second L1 SST.
1652 if (name.size() >= l1_metadata[1].name.size() &&
1653 name.substr(name.size() - l1_metadata[1].name.size())
1654 .compare(l1_metadata[1].name) == 0) {
1655 ASSERT_EQ(1, table_props->num_range_deletions);
1656 ++num_range_deletions;
1657 } else {
1658 ASSERT_EQ(0, table_props->num_range_deletions);
1659 }
1660 }
1661 ASSERT_EQ(1, num_range_deletions);
1662 }
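
// A hypothetical helper distilling the property check above: counting range
// deletions across all live SSTs via table properties. A sketch only; the
// tests in this file inline the loop instead.
//
//   uint64_t CountRangeDeletions(DB* db) {
//     TablePropertiesCollection props;
//     EXPECT_OK(db->GetPropertiesOfAllTables(&props));
//     uint64_t total = 0;
//     for (const auto& name_and_props : props) {
//       total += name_and_props.second->num_range_deletions;
//     }
//     return total;
//   }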
1663
1664 TEST_F(DBRangeDelTest, OverlappedTombstones) {
1665 const int kNumPerFile = 4, kNumFiles = 2;
1666 Options options = CurrentOptions();
1667 options.disable_auto_compactions = true;
1668 options.target_file_size_base = 9 * 1024;
1669 options.max_compaction_bytes = 9 * 1024;
1670 DestroyAndReopen(options);
1671 Random rnd(301);
1672 for (int i = 0; i < kNumFiles; ++i) {
1673 std::vector<std::string> values;
1674 // Write 12K (4 values, each 3K)
1675 for (int j = 0; j < kNumPerFile; j++) {
1676 values.push_back(rnd.RandomString(3 << 10));
1677 ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
1678 }
1679 }
1680 ASSERT_OK(db_->Flush(FlushOptions()));
1681 ASSERT_EQ(1, NumTableFilesAtLevel(0));
1682 MoveFilesToLevel(2);
1683 ASSERT_EQ(2, NumTableFilesAtLevel(2));
1684
1685 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
1686 Key(kNumFiles * kNumPerFile + 1)));
1687 ASSERT_OK(db_->Flush(FlushOptions()));
1688
1689 ASSERT_EQ(1, NumTableFilesAtLevel(0));
1690
1691 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
1692 true /* disallow_trivial_move */));
1693
1694 // The tombstone range is not broken up into multiple SSTs, which could incur
1695 // a large compaction with L2.
1696 ASSERT_EQ(1, NumTableFilesAtLevel(1));
1697 // Now compact the tombstone-only L1 file down to L2.
1698 ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
1699 true /* disallow_trivial_move */));
1700 ASSERT_EQ(1, NumTableFilesAtLevel(2));
1701 ASSERT_EQ(0, NumTableFilesAtLevel(1));
1702 }
1703
1704 TEST_F(DBRangeDelTest, OverlappedKeys) {
1705 const int kNumPerFile = 4, kNumFiles = 2;
1706 Options options = CurrentOptions();
1707 options.disable_auto_compactions = true;
1708 options.target_file_size_base = 9 * 1024;
1709 options.max_compaction_bytes = 9 * 1024;
1710 DestroyAndReopen(options);
1711 Random rnd(301);
1712 for (int i = 0; i < kNumFiles; ++i) {
1713 std::vector<std::string> values;
1714 // Write 12K (4 values, each 3K)
1715 for (int j = 0; j < kNumPerFile; j++) {
1716 values.push_back(rnd.RandomString(3 << 10));
1717 ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
1718 }
1719 }
1720 ASSERT_OK(db_->Flush(FlushOptions()));
1721 ASSERT_EQ(1, NumTableFilesAtLevel(0));
1722 MoveFilesToLevel(2);
1723 ASSERT_EQ(2, NumTableFilesAtLevel(2));
1724
1725 for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
1726 ASSERT_OK(Put(Key(i), "0x123"));
1727 }
1728 ASSERT_OK(db_->Flush(FlushOptions()));
1729 ASSERT_EQ(1, NumTableFilesAtLevel(0));
1730
1731 // The key range is broken up into three SSTs to avoid a future big
1732 // compaction with the grandparent level.
1733 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
1734 true /* disallow_trivial_move */));
1735 ASSERT_EQ(3, NumTableFilesAtLevel(1));
1736
1737 ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
1738 true /* disallow_trivial_move */));
1739 // L1->L2 compaction size is limited to max_compaction_bytes
1740 ASSERT_EQ(3, NumTableFilesAtLevel(2));
1741 ASSERT_EQ(0, NumTableFilesAtLevel(1));
1742 }
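
// Taken together, the two tests above show the asymmetry in compaction output
// cutting: a file of point keys is split into multiple outputs to respect
// max_compaction_bytes against the grandparent level, while a file holding
// only a wide range tombstone is not split the same way.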
1743
1744 TEST_F(DBRangeDelTest, IteratorRefresh) {
1745 // Refreshing an iterator after a range tombstone is added should cause the
1746 // deleted range of keys to disappear.
1747 for (bool sv_changed : {false, true}) {
1748 ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
1749 ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
1750
1751 auto* iter = db_->NewIterator(ReadOptions());
1752 ASSERT_OK(iter->status());
1753
1754 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
1755 "key2", "key3"));
1756
1757 if (sv_changed) {
1758 ASSERT_OK(db_->Flush(FlushOptions()));
1759 }
1760
1761 ASSERT_OK(iter->Refresh());
1762 ASSERT_OK(iter->status());
1763 iter->SeekToFirst();
1764 ASSERT_EQ("key1", iter->key());
1765 iter->Next();
1766 ASSERT_FALSE(iter->Valid());
1767
1768 delete iter;
1769 }
1770 }
1771
1772 void VerifyIteratorReachesEnd(InternalIterator* iter) {
1773 ASSERT_TRUE(!iter->Valid() && iter->status().ok());
1774 }
1775
1776 void VerifyIteratorReachesEnd(Iterator* iter) {
1777 ASSERT_TRUE(!iter->Valid() && iter->status().ok());
1778 }
1779
1780 TEST_F(DBRangeDelTest, IteratorReseek) {
1781 // A range tombstone triggers a reseek (to the tombstone's end key) in the
1782 // merging iterator. Test setup:
1783 // one memtable: range tombstone [0, 1)
1784 // one immutable memtable: range tombstone [1, 2)
1785 // one L0 file with range tombstone [2, 3)
1786 // one L1 file with range tombstone [3, 4)
1787 // Seek(0) should trigger cascading reseeks at all levels below memtable.
1788 // Seek(1) should trigger cascading reseeks at all levels below immutable
1789 // memtable. SeekToFirst and SeekToLast trigger no reseek.
1790 Options options = CurrentOptions();
1791 options.compression = kNoCompression;
1792 options.disable_auto_compactions = true;
1793
1794 DestroyAndReopen(options);
1795 // L1
1796 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
1797 Key(4)));
1798 ASSERT_OK(db_->Flush(FlushOptions()));
1799 MoveFilesToLevel(1);
1800 ASSERT_EQ(1, NumTableFilesAtLevel(1));
1801 // L0
1802 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
1803 Key(3)));
1804 ASSERT_OK(db_->Flush(FlushOptions()));
1805 ASSERT_EQ(1, NumTableFilesAtLevel(0));
1806 // Immutable memtable
1807 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
1808 Key(2)));
1809 ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
1810 std::string value;
1811 ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
1812 "rocksdb.num-immutable-mem-table", &value));
1813 ASSERT_EQ(1, std::stoi(value));
1814 // live memtable
1815 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
1816 Key(1)));
1817 // This memtable is still active, so the immutable count is unchanged.
1818 ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
1819 "rocksdb.num-immutable-mem-table", &value));
1820 ASSERT_EQ(1, std::stoi(value));
1821
1822 auto iter = db_->NewIterator(ReadOptions());
1823 get_perf_context()->Reset();
1824 iter->Seek(Key(0));
1825 // Reseeked immutable memtable, L0 and L1
1826 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 3);
1827 VerifyIteratorReachesEnd(iter);
1828 get_perf_context()->Reset();
1829 iter->SeekForPrev(Key(1));
1830 // Reseeked L0 and L1
1831 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
1832 VerifyIteratorReachesEnd(iter);
1833 get_perf_context()->Reset();
1834 iter->SeekToFirst();
1835 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
1836 VerifyIteratorReachesEnd(iter);
1837 iter->SeekToLast();
1838 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
1839 VerifyIteratorReachesEnd(iter);
1840 delete iter;
1841 }
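
// The perf-context counter used above can be read outside tests too. A
// minimal sketch, assuming the per-thread perf context is enabled at a level
// that counts:
//
//   SetPerfLevel(PerfLevel::kEnableCount);
//   get_perf_context()->Reset();
//   iter->Seek(target);
//   uint64_t reseeks = get_perf_context()->internal_range_del_reseek_count;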
1842
1843 TEST_F(DBRangeDelTest, ReseekDuringNextAndPrev) {
1844 // A range tombstone triggers a reseek during Next()/Prev() in the merging
1845 // iterator. Test setup:
1846 // memtable has: [0, 1) [2, 3)
1847 // L0 has: 2
1848 // L1 has: 1, 2, 3
1849 // Seek(0) will reseek to 1 for L0 and L1. Seek(1) will not trigger any
1850 // reseek. Then Next() determines 2 is covered by [2, 3) and tries to
1851 // reseek to 3 for L0 and L1. The analogous story is tested for Prev() and
1852 // SeekForPrev().
1853 Options options = CurrentOptions();
1854 options.compression = kNoCompression;
1855 options.disable_auto_compactions = true;
1856
1857 DestroyAndReopen(options);
1858 // L1
1859 ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
1860 ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
1861 ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
1862 ASSERT_OK(db_->Flush(FlushOptions()));
1863 MoveFilesToLevel(1);
1864 ASSERT_EQ(1, NumTableFilesAtLevel(1));
1865
1866 // L0
1867 ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
1868 ASSERT_OK(db_->Flush(FlushOptions()));
1869 ASSERT_EQ(1, NumTableFilesAtLevel(0));
1870
1871 // Memtable
1872 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
1873 Key(1)));
1874 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
1875 Key(3)));
1876
1877 auto iter = db_->NewIterator(ReadOptions());
1878 auto iter_test_forward = [&] {
1879 ASSERT_TRUE(iter->Valid());
1880 ASSERT_EQ(iter->key().ToString(), Key(1));
1881
1882 get_perf_context()->Reset();
1883 iter->Next();
1884 ASSERT_TRUE(iter->Valid());
1885 ASSERT_EQ(iter->key().ToString(), Key(3));
1886 // Reseeked L0 and L1
1887 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
1888
1889 // Next to Prev
1890 get_perf_context()->Reset();
1891 iter->Prev();
1892 ASSERT_TRUE(iter->Valid());
1893 ASSERT_EQ(iter->key().ToString(), Key(1));
1894 // Reseeked L0 and L1
1895 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
1896
1897 // Prev to Next
1898 get_perf_context()->Reset();
1899 iter->Next();
1900 ASSERT_TRUE(iter->Valid());
1901 ASSERT_EQ(iter->key().ToString(), Key(3));
1902 // Reseeked L0 and L1
1903 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
1904
1905 iter->Next();
1906 VerifyIteratorReachesEnd(iter);
1907 };
1908
1909 get_perf_context()->Reset();
1910 iter->Seek(Key(0));
1911 // Reseeked L0 and L1
1912 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
1913 iter_test_forward();
1914 get_perf_context()->Reset();
1915 iter->Seek(Key(1));
1916 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
1917 iter_test_forward();
1918
1919 get_perf_context()->Reset();
1920 iter->SeekForPrev(Key(2));
1921 // Reseeked L0 and L1
1922 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
1923 iter_test_forward();
1924 get_perf_context()->Reset();
1925 iter->SeekForPrev(Key(1));
1926 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
1927 iter_test_forward();
1928
1929 get_perf_context()->Reset();
1930 iter->SeekToFirst();
1931 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
1932 iter_test_forward();
1933
1934 iter->SeekToLast();
1935 iter->Prev();
1936 iter_test_forward();
1937 delete iter;
1938 }
1939
1940 TEST_F(DBRangeDelTest, TombstoneFromCurrentLevel) {
1941 // A range tombstone triggers a reseek in the merging iterator when it
1942 // covers a key from the same level. Test setup:
1943 // memtable has: [0, 1)
1944 // L0 has: [2, 3), 2
1945 // L1 has: 1, 2, 3
1946 // Seek(0) will reseek to 1 for L0 and L1.
1947 // Then Next() will reseek to 3 for L1 since 2 in L0 is covered by [2, 3) in
1948 // L0.
1949 Options options = CurrentOptions();
1950 options.compression = kNoCompression;
1951 options.disable_auto_compactions = true;
1952
1953 DestroyAndReopen(options);
1954 // L1
1955 ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
1956 ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
1957 ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
1958 ASSERT_OK(db_->Flush(FlushOptions()));
1959 MoveFilesToLevel(1);
1960 ASSERT_EQ(1, NumTableFilesAtLevel(1));
1961
1962 // L0
1963 ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
1964 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
1965 Key(3)));
1966 ASSERT_OK(db_->Flush(FlushOptions()));
1967 ASSERT_EQ(1, NumTableFilesAtLevel(0));
1968
1969 // Memtable
1970 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
1971 Key(1)));
1972
1973 auto iter = db_->NewIterator(ReadOptions());
1974 get_perf_context()->Reset();
1975 iter->Seek(Key(0));
1976 ASSERT_TRUE(iter->Valid());
1977 ASSERT_EQ(iter->key().ToString(), Key(1));
1978 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
1979
1980 get_perf_context()->Reset();
1981 iter->Next();
1982 ASSERT_TRUE(iter->Valid());
1983 ASSERT_EQ(iter->key().ToString(), Key(3));
1984 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
1985
1986 delete iter;
1987 }
1988
1989 class TombstoneTestSstPartitioner : public SstPartitioner {
1990 public:
1991 const char* Name() const override { return "SingleKeySstPartitioner"; }
1992
1993 PartitionerResult ShouldPartition(
1994 const PartitionerRequest& request) override {
1995 if (cmp->Compare(*request.current_user_key, DBTestBase::Key(5)) == 0) {
1996 return kRequired;
1997 } else {
1998 return kNotRequired;
1999 }
2000 }
2001
2002 bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
2003 const Slice& /*largest_user_key*/) override {
2004 return false;
2005 }
2006
2007 const Comparator* cmp = BytewiseComparator();
2008 };
2009
2010 class TombstoneTestSstPartitionerFactory : public SstPartitionerFactory {
2011 public:
2012 static const char* kClassName() {
2013 return "TombstoneTestSstPartitionerFactory";
2014 }
2015 const char* Name() const override { return kClassName(); }
2016
2017 std::unique_ptr<SstPartitioner> CreatePartitioner(
2018 const SstPartitioner::Context& /* context */) const override {
2019 return std::unique_ptr<SstPartitioner>(new TombstoneTestSstPartitioner());
2020 }
2021 };
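
// The partitioner above forces a compaction output cut exactly when the
// current key is Key(5) and forbids trivial moves, giving the next test
// deterministic L1 file boundaries regardless of size-based cutting.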
2022
2023 TEST_F(DBRangeDelTest, TombstoneAcrossFileBoundary) {
2024 // Verify that a range tombstone across a file boundary covers keys from
2025 // older levels. Test setup:
2026 // L1_0: 1, 3, [2, 6)
2027 // L1_1: 5, 7, [2, 6) ([2, 6) is from a compaction with L1_0)
2028 // L2: 5
2029 // Seek(1) then Next() moves the L1 iterator to L1_1; 5 in L2 stays covered.
2030 Options options = CurrentOptions();
2031 options.compression = kNoCompression;
2032 options.disable_auto_compactions = true;
2033 options.target_file_size_base = 2 * 1024;
2034 options.max_compaction_bytes = 2 * 1024;
2035
2036 // Make sure L1 files are split before "5"
2037 auto factory = std::make_shared<TombstoneTestSstPartitionerFactory>();
2038 options.sst_partitioner_factory = factory;
2039
2040 DestroyAndReopen(options);
2041
2042 Random rnd(301);
2043 // L2
2044 // The file should be smaller than max_compaction_bytes; otherwise the L1
2045 // output file would be cut before 7.
2046 ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 9)));
2047 ASSERT_OK(db_->Flush(FlushOptions()));
2048 MoveFilesToLevel(2);
2049 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2050
2051 // L1_1
2052 ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 10)));
2053 ASSERT_OK(db_->Put(WriteOptions(), Key(7), rnd.RandomString(1 << 10)));
2054 ASSERT_OK(db_->Flush(FlushOptions()));
2055 ASSERT_EQ(1, NumTableFilesAtLevel(0));
2056
2057 // L1_0
2058 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(1 << 10)));
2059 ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(1 << 10)));
2060 // Prevent keys being compacted away
2061 const Snapshot* snapshot = db_->GetSnapshot();
2062 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
2063 Key(6)));
2064 ASSERT_OK(db_->Flush(FlushOptions()));
2065 ASSERT_EQ(2, NumTableFilesAtLevel(0));
2066 MoveFilesToLevel(1);
2067 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2068
2069 auto iter = db_->NewIterator(ReadOptions());
2070 get_perf_context()->Reset();
2071 iter->Seek(Key(1));
2072 ASSERT_TRUE(iter->Valid());
2073 ASSERT_EQ(iter->key().ToString(), Key(1));
2074 iter->Next();
2075 ASSERT_TRUE(iter->Valid());
2076 ASSERT_EQ(iter->key().ToString(), Key(7));
2077 // 1 reseek into L2 when key 5 in L2 is covered by [2, 6) from L1
2078 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
2079
2080 delete iter;
2081 db_->ReleaseSnapshot(snapshot);
2082 }
2083
2084 TEST_F(DBRangeDelTest, NonOverlappingTombstoneAtBoundary) {
2085 // Verify that a range tombstone across file boundary covers keys from older
2086 // levels.
2087 // Test set up:
2088 // L1_0: 1, 3, [4, 7); L1_1: 6, 8, [4, 7)
2089 // L2: 5
2090 // Note that [4, 7) is at the end of L1_0 and does not overlap with any point
2091 // key in L1_0. [4, 7) from L1_0 should cover 5 in L2 if the sentinel works.
2092 Options options = CurrentOptions();
2093 options.compression = kNoCompression;
2094 options.disable_auto_compactions = true;
2095 options.target_file_size_base = 2 * 1024;
2096 DestroyAndReopen(options);
2097
2098 Random rnd(301);
2099 // L2
2100 ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
2101 ASSERT_OK(db_->Flush(FlushOptions()));
2102 MoveFilesToLevel(2);
2103 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2104
2105 // L1_1
2106 ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
2107 ASSERT_OK(db_->Put(WriteOptions(), Key(8), rnd.RandomString(4 << 10)));
2108 ASSERT_OK(db_->Flush(FlushOptions()));
2109 ASSERT_EQ(1, NumTableFilesAtLevel(0));
2110
2111 // L1_0
2112 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
2113 ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
2114 // Prevent keys being compacted away
2115 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
2116 Key(7)));
2117 ASSERT_OK(db_->Flush(FlushOptions()));
2118 ASSERT_EQ(2, NumTableFilesAtLevel(0));
2119 MoveFilesToLevel(1);
2120 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2121
2122 auto iter = db_->NewIterator(ReadOptions());
2123 iter->Seek(Key(3));
2124 ASSERT_TRUE(iter->Valid());
2125 ASSERT_EQ(iter->key(), Key(3));
2126 get_perf_context()->Reset();
2127 iter->Next();
2128 ASSERT_TRUE(iter->Valid());
2129 ASSERT_EQ(iter->key().ToString(), Key(8));
2130 // 1 reseek into L1 since 5 from L2 is covered by [4, 7) from L1
2131 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
2132 for (auto& k : {4, 5, 6}) {
2133 get_perf_context()->Reset();
2134 iter->Seek(Key(k));
2135 ASSERT_TRUE(iter->Valid());
2136 ASSERT_EQ(iter->key().ToString(), Key(8));
2137 // 1 reseek into L1
2138 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
2139 }
2140 delete iter;
2141 }
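
// Terminology note for the tests around here: a "sentinel key" is a file
// boundary surfaced by LevelIterator as a fake key when the boundary is
// defined by a range tombstone rather than a point key. It keeps the file
// (and hence its tombstone) open in the merging iterator until iteration
// moves past the boundary, so the tombstone can still cover keys from older
// levels.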
2142
2143 TEST_F(DBRangeDelTest, OlderLevelHasNewerData) {
2144 // L1_0: 1, 3, [2, 7); L1_1: 5, 6 at a newer sequence number than [2, 7)
2145 // Compact L1_1 to L2. Seek(3) should not skip 5 or 6.
2146 Options options = CurrentOptions();
2147 options.compression = kNoCompression;
2148 options.disable_auto_compactions = true;
2149 options.target_file_size_base = 3 * 1024;
2150 DestroyAndReopen(options);
2151
2152 Random rnd(301);
2153 // L1_0
2154 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
2155 ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
2156 const Snapshot* snapshot = db_->GetSnapshot();
2157 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
2158 Key(7)));
2159 ASSERT_OK(db_->Flush(FlushOptions()));
2160 MoveFilesToLevel(1);
2161 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2162
2163 // L1_1
2164 ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
2165 ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
2166 ASSERT_OK(db_->Flush(FlushOptions()));
2167 ASSERT_EQ(1, NumTableFilesAtLevel(0));
2168 MoveFilesToLevel(1);
2169 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2170
2171 auto key = Key(6);
2172 Slice begin(key);
2173 EXPECT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
2174 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2175 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2176
2177 auto iter = db_->NewIterator(ReadOptions());
2178 iter->Seek(Key(3));
2179 ASSERT_TRUE(iter->Valid());
2180 ASSERT_EQ(iter->key().ToString(), Key(5));
2181 iter->Next();
2182 ASSERT_TRUE(iter->Valid());
2183 ASSERT_EQ(iter->key().ToString(), Key(6));
2184 delete iter;
2185 db_->ReleaseSnapshot(snapshot);
2186 }
2187
2188 TEST_F(DBRangeDelTest, LevelBoundaryDefinedByTombstone) {
2189 // L1 has: 1, 2, [4, 5)
2190 // L2 has: 4
2191 // Seek(3) lands past all point keys in L1; check whether the sentinel
2192 // key from L1 works in this case.
2193 Options options = CurrentOptions();
2194 options.compression = kNoCompression;
2195 options.disable_auto_compactions = true;
2196 options.target_file_size_base = 3 * 1024;
2197 DestroyAndReopen(options);
2198 Random rnd(301);
2199 // L2
2200 ASSERT_OK(db_->Put(WriteOptions(), Key(4), "foo"));
2201 ASSERT_OK(db_->Flush(FlushOptions()));
2202 const Snapshot* snapshot = db_->GetSnapshot();
2203 MoveFilesToLevel(2);
2204 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2205
2206 // L1_0
2207 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
2208 ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
2209 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
2210 Key(5)));
2211 ASSERT_OK(db_->Flush(FlushOptions()));
2212 MoveFilesToLevel(1);
2213 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2214 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2215
2216 auto iter = db_->NewIterator(ReadOptions());
2217 iter->Seek(Key(3));
2218 ASSERT_TRUE(!iter->Valid());
2219 ASSERT_OK(iter->status());
2220
2221 get_perf_context()->Reset();
2222 iter->SeekForPrev(Key(5));
2223 ASSERT_TRUE(iter->Valid());
2224 ASSERT_EQ(iter->key(), Key(2));
2225 db_->ReleaseSnapshot(snapshot);
2226 delete iter;
2227 }
2228
2229 TEST_F(DBRangeDelTest, TombstoneOnlyFile) {
2230 // L1_0: 1, 2; L1_1: [3, 5)
2231 // L2: 3
2232 // Seek(2) then Next() should advance the L1 iterator into L1_1.
2233 // If the sentinel works with a tombstone-only file, it should cover the key
2234 // in L2. A similar story holds for SeekForPrev(4).
2235 Options options = CurrentOptions();
2236 options.compression = kNoCompression;
2237 options.disable_auto_compactions = true;
2238 options.target_file_size_base = 3 * 1024;
2239
2240 DestroyAndReopen(options);
2241 Random rnd(301);
2242 // L2
2243 ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
2244 ASSERT_OK(db_->Flush(FlushOptions()));
2245 MoveFilesToLevel(2);
2246 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2247
2248 // L1_0
2249 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
2250 ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
2251 ASSERT_OK(db_->Flush(FlushOptions()));
2252 MoveFilesToLevel(1);
2253 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2254 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2255
2256 // L1_1
2257 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
2258 Key(5)));
2259 ASSERT_OK(db_->Flush(FlushOptions()));
2260 MoveFilesToLevel(1);
2261 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2262 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2263
2264 auto iter = db_->NewIterator(ReadOptions());
2265 iter->Seek(Key(2));
2266 ASSERT_TRUE(iter->Valid());
2267 ASSERT_EQ(iter->key(), Key(2));
2268 iter->Next();
2269 VerifyIteratorReachesEnd(iter);
2270 iter->SeekForPrev(Key(4));
2271 ASSERT_TRUE(iter->Valid());
2272 ASSERT_EQ(iter->key(), Key(2));
2273 iter->Next();
2274 VerifyIteratorReachesEnd(iter);
2275 delete iter;
2276 }
2277
2278 void VerifyIteratorKey(InternalIterator* iter,
2279 const std::vector<std::string>& expected_keys,
2280 bool forward = true) {
2281 for (auto& key : expected_keys) {
2282 ASSERT_TRUE(iter->Valid());
2283 ASSERT_EQ(iter->user_key(), key);
2284 if (forward) {
2285 iter->Next();
2286 } else {
2287 iter->Prev();
2288 }
2289 }
2290 }
2291
2292 TEST_F(DBRangeDelTest, TombstoneOnlyLevel) {
2293 // L1: [3, 5)
2294 // L2: 3, 4
2295 // Any kind of iterator seek should skip 3 and 4 in L2.
2296 // The L1 level iterator should produce a sentinel key.
2297 Options options = CurrentOptions();
2298 options.compression = kNoCompression;
2299 options.disable_auto_compactions = true;
2300 options.target_file_size_base = 3 * 1024;
2301
2302 DestroyAndReopen(options);
2303 // L2
2304 ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
2305 ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
2306 ASSERT_OK(db_->Flush(FlushOptions()));
2307 MoveFilesToLevel(2);
2308 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2309
2310 // L1
2311 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
2312 Key(5)));
2313 ASSERT_OK(db_->Flush(FlushOptions()));
2314 MoveFilesToLevel(1);
2315 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2316
2317 auto iter = db_->NewIterator(ReadOptions());
2318 get_perf_context()->Reset();
2319 uint64_t expected_reseek = 0;
2320 for (auto i = 0; i < 7; ++i) {
2321 iter->Seek(Key(i));
2322 VerifyIteratorReachesEnd(iter);
2323 if (i < 5) {
2324 ++expected_reseek;
2325 }
2326 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
2327 expected_reseek);
2328 iter->SeekForPrev(Key(i));
2329 VerifyIteratorReachesEnd(iter);
2330 if (i > 2) {
2331 ++expected_reseek;
2332 }
2333 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
2334 expected_reseek);
2335 iter->SeekToFirst();
2336 VerifyIteratorReachesEnd(iter);
2337 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
2338 ++expected_reseek);
2339 iter->SeekToLast();
2340 VerifyIteratorReachesEnd(iter);
2341 ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
2342 ++expected_reseek);
2343 }
2344 delete iter;
2345
2346 // Check L1 LevelIterator behavior
2347 ColumnFamilyData* cfd =
2348 static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
2349 ->cfd();
2350 SuperVersion* sv = cfd->GetSuperVersion();
2351 Arena arena;
2352 ReadOptions read_options;
2353 MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
2354 false /* prefix seek */);
2355 InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
2356 read_options, &merge_iter_builder, 1 /* level */, true);
2357 // This is needed to make LevelIterator range tombstone aware
2358 auto miter = merge_iter_builder.Finish();
2359 auto k = Key(3);
2360 IterKey target;
2361 target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
2362 level_iter->Seek(target.GetInternalKey());
2363 // sentinel key (file boundary as a fake key)
2364 VerifyIteratorKey(level_iter, {Key(5)});
2365 VerifyIteratorReachesEnd(level_iter);
2366
2367 k = Key(5);
2368 target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
2369 level_iter->SeekForPrev(target.GetInternalKey());
2370 VerifyIteratorKey(level_iter, {Key(3)}, false);
2371 VerifyIteratorReachesEnd(level_iter);
2372
2373 level_iter->SeekToFirst();
2374 VerifyIteratorKey(level_iter, {Key(5)});
2375 VerifyIteratorReachesEnd(level_iter);
2376
2377 level_iter->SeekToLast();
2378 VerifyIteratorKey(level_iter, {Key(3)}, false);
2379 VerifyIteratorReachesEnd(level_iter);
2380
2381 miter->~InternalIterator();
2382 }
2383
2384 TEST_F(DBRangeDelTest, TombstoneOnlyWithOlderVisibleKey) {
2385 // L1: [3, 5)
2386 // L2: 2, 4, 5
2387 // 2 and 5 should be visible
2388 Options options = CurrentOptions();
2389 options.compression = kNoCompression;
2390 options.disable_auto_compactions = true;
2391 options.target_file_size_base = 3 * 1024;
2392
2393 DestroyAndReopen(options);
2394 // L2
2395 ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
2396 ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
2397 ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
2398 ASSERT_OK(db_->Flush(FlushOptions()));
2399 MoveFilesToLevel(2);
2400 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2401
2402 // L1
2403 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
2404 Key(5)));
2405 ASSERT_OK(db_->Flush(FlushOptions()));
2406 MoveFilesToLevel(1);
2407 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2408
2409 auto iter = db_->NewIterator(ReadOptions());
2410 auto iter_test_backward = [&] {
2411 ASSERT_TRUE(iter->Valid());
2412 ASSERT_EQ(iter->key(), Key(5));
2413 iter->Prev();
2414 ASSERT_TRUE(iter->Valid());
2415 ASSERT_EQ(iter->key(), Key(2));
2416 iter->Prev();
2417 VerifyIteratorReachesEnd(iter);
2418 };
2419 auto iter_test_forward = [&] {
2420 ASSERT_TRUE(iter->Valid());
2421 ASSERT_EQ(iter->key(), Key(2));
2422 iter->Next();
2423 ASSERT_TRUE(iter->Valid());
2424 ASSERT_EQ(iter->key(), Key(5));
2425 iter->Next();
2426 VerifyIteratorReachesEnd(iter);
2427 };
2428 iter->Seek(Key(4));
2429 iter_test_backward();
2430 iter->SeekForPrev(Key(4));
2431 iter->Next();
2432 iter_test_backward();
2433
2434 iter->Seek(Key(4));
2435 iter->Prev();
2436 iter_test_forward();
2437 iter->SeekForPrev(Key(4));
2438 iter_test_forward();
2439
2440 iter->SeekToFirst();
2441 iter_test_forward();
2442 iter->SeekToLast();
2443 iter_test_backward();
2444
2445 delete iter;
2446 }
2447
2448 TEST_F(DBRangeDelTest, TombstoneSentinelDirectionChange) {
2449 // L1: 7
2450 // L2: [4, 6)
2451 // L3: 4
2452 // Seek(5) puts the sentinel key 6 at the top of the min-heap in the merging
2453 // iterator; then do a Prev() and verify the sentinel handles the direction
2454 // change. Redo the test after Put(5) into L1 so that there is a visible key
2455 // in range [4, 6).
2456 Options options = CurrentOptions();
2457 options.compression = kNoCompression;
2458 options.disable_auto_compactions = true;
2459 options.target_file_size_base = 3 * 1024;
2460
2461 DestroyAndReopen(options);
2462 // L3
2463 ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
2464 ASSERT_OK(db_->Flush(FlushOptions()));
2465 MoveFilesToLevel(3);
2466 ASSERT_EQ(1, NumTableFilesAtLevel(3));
2467 // L2
2468 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
2469 Key(6)));
2470 ASSERT_OK(db_->Flush(FlushOptions()));
2471 MoveFilesToLevel(2);
2472 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2473
2474 // L1
2475 ASSERT_OK(db_->Put(WriteOptions(), Key(7), "foobar"));
2476 ASSERT_OK(db_->Flush(FlushOptions()));
2477 MoveFilesToLevel(1);
2478 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2479
2480 auto iter = db_->NewIterator(ReadOptions());
2481 iter->Seek(Key(5));
2482 ASSERT_TRUE(iter->Valid());
2483 ASSERT_EQ(iter->key(), Key(7));
2484 iter->Prev();
2485 ASSERT_TRUE(!iter->Valid() && iter->status().ok());
2486 delete iter;
2487
2488 ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
2489 ASSERT_OK(db_->Flush(FlushOptions()));
2490 MoveFilesToLevel(1);
2491 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2492
2493 iter = db_->NewIterator(ReadOptions());
2494 iter->Seek(Key(5));
2495 ASSERT_TRUE(iter->Valid());
2496 ASSERT_EQ(iter->key(), Key(5));
2497 iter->Prev();
2498 ASSERT_TRUE(!iter->Valid() && iter->status().ok());
2499 delete iter;
2500 }
2501
2502 // The right sentinel is tested in many of the test cases above.
2503 TEST_F(DBRangeDelTest, LeftSentinelKeyTest) {
2504 // L1_0: 0, 1; L1_1: [2, 3), 5
2505 // L2: 2
2506 // SeekForPrev(4) should give 1 due to the sentinel key keeping [2, 3) alive.
2507 Options options = CurrentOptions();
2508 options.compression = kNoCompression;
2509 options.disable_auto_compactions = true;
2510 options.target_file_size_base = 3 * 1024;
2511 options.max_compaction_bytes = 1024;
2512
2513 DestroyAndReopen(options);
2514 // L2
2515 ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
2516 ASSERT_OK(db_->Flush(FlushOptions()));
2517 MoveFilesToLevel(2);
2518 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2519
2520 // L1_0
2521 Random rnd(301);
2522 ASSERT_OK(db_->Put(WriteOptions(), Key(0), rnd.RandomString(4 << 10)));
2523 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
2524 ASSERT_OK(db_->Flush(FlushOptions()));
2525 MoveFilesToLevel(1);
2526 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2527
2528 // L1_1
2529 ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
2530 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
2531 Key(3)));
2532 ASSERT_OK(db_->Flush(FlushOptions()));
2533 MoveFilesToLevel(1);
2534 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2535
2536 auto iter = db_->NewIterator(ReadOptions());
2537 iter->SeekForPrev(Key(4));
2538 ASSERT_TRUE(iter->Valid());
2539 ASSERT_EQ(iter->key(), Key(1));
2540 iter->Prev();
2541 ASSERT_TRUE(iter->Valid());
2542 ASSERT_EQ(iter->key(), Key(0));
2543 iter->Prev();
2544 ASSERT_TRUE(!iter->Valid());
2545 ASSERT_OK(iter->status());
2546 delete iter;
2547 }
2548
2549 TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) {
2550 // L1_0: 1, 2 (newer than L1_1); L1_1: [2, 4), 5
2551 // L2: 3
2552 // SeekForPrev(4) then Prev() should give 2 and then 1.
2553 Options options = CurrentOptions();
2554 options.compression = kNoCompression;
2555 options.disable_auto_compactions = true;
2556 options.target_file_size_base = 3 * 1024;
2557 options.max_compaction_bytes = 1024;
2558
2559 DestroyAndReopen(options);
2560 // L2
2561 ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
2562 ASSERT_OK(db_->Flush(FlushOptions()));
2563 MoveFilesToLevel(2);
2564 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2565
2566 // L1_1
2567 ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
2568 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
2569 Key(4)));
2570 ASSERT_OK(db_->Flush(FlushOptions()));
2571 MoveFilesToLevel(1);
2572 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2573
2574 // L1_0
2575 Random rnd(301);
2576 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
2577 ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
2578 // Used to verify sequence number of iterator key later.
2579 auto seq = dbfull()->TEST_GetLastVisibleSequence();
2580 ASSERT_OK(db_->Flush(FlushOptions()));
2581 MoveFilesToLevel(1);
2582 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2583
2584 Arena arena;
2585 InternalKeyComparator icmp(options.comparator);
2586 ReadOptions read_options;
2587 ScopedArenaIterator iter;
2588 iter.set(
2589 dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber));
2590
2591 auto k = Key(4);
2592 IterKey target;
2593 target.SetInternalKey(k, 0 /* sequence_number */, kValueTypeForSeekForPrev);
2594 iter->SeekForPrev(target.GetInternalKey());
2595 ASSERT_TRUE(iter->Valid());
2596 ASSERT_EQ(iter->user_key(), Key(2));
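  // An internal key ends in an 8-byte footer packing (sequence << 8) | type;
  // decode it to confirm this entry is the newer Put from L1_0.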
2597 SequenceNumber actual_seq;
2598 ValueType type;
2599 UnPackSequenceAndType(ExtractInternalKeyFooter(iter->key()), &actual_seq,
2600 &type);
2601 ASSERT_EQ(seq, actual_seq);
2602 // might as well check type
2603 ASSERT_EQ(type, kTypeValue);
2604
2605 iter->Prev();
2606 ASSERT_TRUE(iter->Valid());
2607 ASSERT_EQ(iter->user_key(), Key(1));
2608 iter->Prev();
2609 ASSERT_TRUE(!iter->Valid());
2610 ASSERT_OK(iter->status());
2611 }
2612
2613 TEST_F(DBRangeDelTest, SentinelKeyCommonCaseTest) {
2614 // L1 has 3 files
2615 // L1_0: 1, 2; L1_1: [3, 4), 5, 6, [7, 8); L1_2: 9
2616 // Check iterator operations on LevelIterator.
2617 Options options = CurrentOptions();
2618 options.compression = kNoCompression;
2619 options.disable_auto_compactions = true;
2620 options.target_file_size_base = 3 * 1024;
2621
2622 DestroyAndReopen(options);
2623 Random rnd(301);
2624 // L1_0
2625 ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
2626 ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
2627 ASSERT_OK(db_->Flush(FlushOptions()));
2628 MoveFilesToLevel(1);
2629 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2630
2631 // L1_1
2632 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
2633 Key(4)));
2634 ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
2635 ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
2636 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7),
2637 Key(8)));
2638 ASSERT_OK(db_->Flush(FlushOptions()));
2639 MoveFilesToLevel(1);
2640 ASSERT_EQ(2, NumTableFilesAtLevel(1));
2641
2642 // L1_2
2643 ASSERT_OK(db_->Put(WriteOptions(), Key(9), rnd.RandomString(4 << 10)));
2644 ASSERT_OK(db_->Flush(FlushOptions()));
2645 MoveFilesToLevel(1);
2646 ASSERT_EQ(3, NumTableFilesAtLevel(1));
2647
2648 ColumnFamilyData* cfd =
2649 static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
2650 ->cfd();
2651 SuperVersion* sv = cfd->GetSuperVersion();
2652 Arena arena;
2653 ReadOptions read_options;
2654 MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
2655 false /* prefix seek */);
2656 InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
2657 read_options, &merge_iter_builder, 1 /* level */, true);
2658 // This is needed to make LevelIterator range tombstone aware
2659 auto miter = merge_iter_builder.Finish();
2660 auto k = Key(7);
2661 IterKey target;
2662 target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
2663 level_iter->Seek(target.GetInternalKey());
2664 // The last Key(9) is a sentinel key.
2665 VerifyIteratorKey(level_iter, {Key(8), Key(9), Key(9)});
2666 ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
2667
2668 k = Key(6);
2669 target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
2670 level_iter->Seek(target.GetInternalKey());
2671 VerifyIteratorKey(level_iter, {Key(6), Key(8), Key(9), Key(9)});
2672 ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
2673
2674 k = Key(4);
2675 target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
2676 level_iter->SeekForPrev(target.GetInternalKey());
2677 VerifyIteratorKey(level_iter, {Key(3), Key(2), Key(1), Key(1)}, false);
2678 ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
2679
2680 k = Key(5);
2681 target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
2682 level_iter->SeekForPrev(target.GetInternalKey());
2683 VerifyIteratorKey(level_iter, {Key(5), Key(3), Key(2), Key(1), Key(1)},
2684 false);
2685
2686 level_iter->SeekToFirst();
2687 VerifyIteratorKey(level_iter, {Key(1), Key(2), Key(2), Key(5), Key(6), Key(8),
2688 Key(9), Key(9)});
2689 ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
2690
2691 level_iter->SeekToLast();
2692 VerifyIteratorKey(
2693 level_iter,
2694 {Key(9), Key(9), Key(6), Key(5), Key(3), Key(2), Key(1), Key(1)}, false);
2695 ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
2696
2697 miter->~InternalIterator();
2698 }
2699
2700 TEST_F(DBRangeDelTest, PrefixSentinelKey) {
2701 // L1: ['aaaa', 'aaad'), 'bbbb'
2702 // L2: 'aaac', 'aaae'
2703 // The prefix extractor uses the first 3 chars.
2704 // Seek('aaab') should give 'aaae' as the first key.
2705 // This tests a previous bug where, when prefix seek found no matching prefix
2706 // in the SST file, LevelIterator would set the file iterator to null and
2707 // could just skip to the next SST file. In that case, the file's range
2708 // tombstone must be kept alive so it still covers keys in older levels.
2709 Options options = CurrentOptions();
2710 options.compression = kNoCompression;
2711 options.disable_auto_compactions = true;
2712 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
2713 BlockBasedTableOptions table_options;
2714 table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
2715 table_options.whole_key_filtering = false;
2716 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
2717 DestroyAndReopen(options);
2718 Random rnd(301);
2719
2720 // L2:
2721 ASSERT_OK(db_->Put(WriteOptions(), "aaac", rnd.RandomString(10)));
2722 ASSERT_OK(db_->Put(WriteOptions(), "aaae", rnd.RandomString(10)));
2723 ASSERT_OK(db_->Flush(FlushOptions()));
2724 MoveFilesToLevel(2);
2725 ASSERT_EQ(1, NumTableFilesAtLevel(2));
2726
2727 // L1
2728 ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "aaaa",
2729 "aaad"));
2730 ASSERT_OK(db_->Put(WriteOptions(), "bbbb", rnd.RandomString(10)));
2731 ASSERT_OK(db_->Flush(FlushOptions()));
2732 MoveFilesToLevel(1);
2733 ASSERT_EQ(1, NumTableFilesAtLevel(1));
2734
2735 auto iter = db_->NewIterator(ReadOptions());
2736 iter->Seek("aaab");
2737 ASSERT_TRUE(iter->Valid());
2738 ASSERT_EQ(iter->key(), "aaae");
2739 delete iter;
2740 }
2741
2742 TEST_F(DBRangeDelTest, RefreshMemtableIter) {
2743 Options options = CurrentOptions();
2744 options.disable_auto_compactions = true;
2745 DestroyAndReopen(options);
2746 ASSERT_OK(
2747 db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
2748 ReadOptions ro;
2749 ro.read_tier = kMemtableTier;
2750 std::unique_ptr<Iterator> iter{db_->NewIterator(ro)};
2751 ASSERT_OK(Flush());
2752 // The first refresh reinitializes the iterator. There used to be a bug where
2753 // iter.memtable_range_tombstone_iter_ was not reset to nullptr, which caused
2754 // a subsequent refresh to double-free it.
2755 ASSERT_OK(iter->Refresh());
2756 ASSERT_OK(iter->Refresh());
2757 }
2758
2759 TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
2760 // Memtable: a, [b, bz)
2761 // Do a Seek on `a` with iterate_upper_bound set to `az`. The range
2762 // tombstone [b, bz) should not be processed (added to and
2763 // popped from the min_heap in MergingIterator).
2764 Options options = CurrentOptions();
2765 options.disable_auto_compactions = true;
2766 DestroyAndReopen(options);
2767
2768 ASSERT_OK(Put("a", "bar"));
2769 ASSERT_OK(
2770 db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
2771
2772 // There is no cleaner way to test this without relying on an
2773 // implementation detail. Checking the value of
2774 // `internal_range_del_reseek_count` does not work here,
2775 // since the BlockBasedTable iterator becomes !Valid() when the point key
2776 // is out of bound, and that reseek only happens when a point key
2777 // is covered by some range tombstone.
2778 SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
2779 [](void*) {
2780 // there should not be any range
2781 // tombstone in the heap.
2782 FAIL();
2783 });
2784 SyncPoint::GetInstance()->EnableProcessing();
2785
2786 ReadOptions read_opts;
2787 std::string upper_bound = "az";
2788 Slice upper_bound_slice = upper_bound;
2789 read_opts.iterate_upper_bound = &upper_bound_slice;
2790 std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
2791 iter->Seek("a");
2792 ASSERT_TRUE(iter->Valid());
2793 ASSERT_EQ(iter->key(), "a");
2794 iter->Next();
2795 ASSERT_FALSE(iter->Valid());
2796 ASSERT_OK(iter->status());
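
  // Clear the FAIL() callback so it cannot fire in later tests running in
  // this process.
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();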
2797 }
2798
2799 #endif // ROCKSDB_LITE
2800
2801 } // namespace ROCKSDB_NAMESPACE
2802
2803 int main(int argc, char** argv) {
2804 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
2805 ::testing::InitGoogleTest(&argc, argv);
2806 return RUN_ALL_TESTS();
2807 }