// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"

namespace rocksdb {

class DBRangeDelTest : public DBTestBase {
 public:
  DBRangeDelTest() : DBTestBase("/db_range_del_test") {}

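  // Helper that packs `key` into a fixed-width 8-byte string. Several tests
  // below pair these keys with test::Uint64Comparator so that they sort
  // numerically rather than lexicographically.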
  std::string GetNumericStr(int key) {
    uint64_t uint64_key = static_cast<uint64_t>(key);
    std::string str;
    str.resize(8);
    memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
    return str;
  }
};

// PlainTableFactory and NumTableFilesAtLevel() are not supported in
// ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
  Options opts = CurrentOptions();
  opts.table_factory.reset(new PlainTableFactory());
  opts.prefix_extractor.reset(NewNoopTransform());
  opts.allow_mmap_reads = true;
  opts.max_sequential_skip_in_iterations = 999999;
  Reopen(opts);

  ASSERT_TRUE(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", "dr1")
          .IsNotSupported());
}

TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
                             "dr2"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
}

TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.statistics = CreateDBStatistics();
  Reopen(opts);

  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();
  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z");
  db_->Flush(FlushOptions());

  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));
  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                              true /* disallow_trivial_move */);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));
  ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
  // regression test for exactly filled compaction output files. Previously
  // another file would be generated containing all range deletions, which
  // could invalidate the non-overlapping file boundary invariant.
  const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumFiles;
  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  options.num_levels = 2;
  options.target_file_size_base = kFileBytes;
  BlockBasedTableOptions table_options;
  table_options.block_size_deviation = 50;  // each block holds two keys
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  Reopen(options);

  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();
  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1));

  Random rnd(301);
  for (int i = 0; i < kNumFiles; ++i) {
    std::vector<std::string> values;
    // Write 12K (4 values, each 3K)
    for (int j = 0; j < kNumPerFile; j++) {
      values.push_back(RandomString(&rnd, 3 << 10));
      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
      if (j == 0 && i > 0) {
        dbfull()->TEST_WaitForFlushMemTable();
      }
    }
  }
  // put extra key to trigger final flush
  ASSERT_OK(Put("", ""));
  dbfull()->TEST_WaitForFlushMemTable();
  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));

  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                              true /* disallow_trivial_move */);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(2, NumTableFilesAtLevel(1));
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
  // Ensures range deletion spanning multiple compaction output files that are
  // cut by max_compaction_bytes will have non-overlapping key-ranges.
  // https://github.com/facebook/rocksdb/issues/1778
  const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.disable_auto_compactions = true;
  opts.level0_file_num_compaction_trigger = kNumFiles;
  opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  // Want max_compaction_bytes to trigger the end of compaction output file, not
  // target_file_size_base, so make the latter much bigger
  opts.target_file_size_base = 100 * opts.max_compaction_bytes;
  Reopen(opts);

  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();

  // It spans the whole key-range, thus will be included in all output files
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                             GetNumericStr(0),
                             GetNumericStr(kNumFiles * kNumPerFile - 1)));
  Random rnd(301);
  for (int i = 0; i < kNumFiles; ++i) {
    std::vector<std::string> values;
    // Write 1MB (256 values, each 4K)
    for (int j = 0; j < kNumPerFile; j++) {
      values.push_back(RandomString(&rnd, kBytesPerVal));
      ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
    }
    // extra entry to trigger SpecialSkipListFactory's flush
    ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
  }

  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                              true /* disallow_trivial_move */);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_GE(NumTableFilesAtLevel(1), 2);

  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);

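  // Adjacent L1 output files must have non-overlapping key ranges: each file's
  // largest key must sort strictly before the next file's smallest key.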
  for (size_t i = 0; i < files[1].size() - 1; ++i) {
    ASSERT_TRUE(InternalKeyComparator(opts.comparator)
                    .Compare(files[1][i].largest, files[1][i + 1].smallest) <
                0);
  }
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
  // Regression test for bug where sentinel range deletions (i.e., ones with
  // sequence number of zero) were included in output files.
  // snapshot protects range tombstone from dropping due to becoming obsolete.
  const Snapshot* snapshot = db_->GetSnapshot();

  // gaps between ranges create sentinels in our internal representation
  std::vector<std::pair<std::string, std::string>> range_dels = {
      {"a", "b"}, {"c", "d"}, {"e", "f"}};
  for (const auto& range_del : range_dels) {
    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                               range_del.first, range_del.second));
  }
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(1, NumTableFilesAtLevel(0));

  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
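  // A smallest_seqno of zero would indicate a sentinel tombstone (sequence
  // number zero) made it into the output file.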
  ASSERT_GT(files[0][0].smallest_seqno, 0);

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
  db_->Put(WriteOptions(), "b1", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
  db_->Put(WriteOptions(), "b2", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
  // first iteration verifies query correctness in memtable, second verifies
  // query correctness for a single SST file
  for (int i = 0; i < 2; ++i) {
    if (i > 0) {
      ASSERT_OK(db_->Flush(FlushOptions()));
      ASSERT_EQ(1, NumTableFilesAtLevel(0));
    }
    std::string value;
    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
    ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
  }
}

TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
  db_->Put(WriteOptions(), "unused", "val");  // prevents empty after compaction
  db_->Put(WriteOptions(), "b1", "val");
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  ASSERT_EQ(3, NumTableFilesAtLevel(0));

  for (int i = 0; i < 2; ++i) {
    if (i > 0) {
      dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
                                  true /* disallow_trivial_move */);
      ASSERT_EQ(0, NumTableFilesAtLevel(0));
      ASSERT_EQ(1, NumTableFilesAtLevel(1));
    }
    std::string value;
    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
  }
}
#endif  // ROCKSDB_LITE

TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
  const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  Reopen(opts);

  // Write a third before snapshot, a third between snapshot and tombstone, and
  // a third after the tombstone. Keys older than snapshot or newer than the
  // tombstone should be preserved.
  const Snapshot* snapshot = nullptr;
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 3) {
      snapshot = db_->GetSnapshot();
    } else if (i == 2 * kNum / 3) {
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
  }
  db_->Flush(FlushOptions());

  for (int i = 0; i < kNum; ++i) {
    ReadOptions read_opts;
    read_opts.ignore_range_deletions = true;
    std::string value;
    if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
      ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
    } else {
      ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
    }
  }
  db_->ReleaseSnapshot(snapshot);
}

// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
  const int kNumPerFile = 100, kNumFiles = 4;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.disable_auto_compactions = true;
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  opts.num_levels = 2;
  opts.statistics = CreateDBStatistics();
  Reopen(opts);

  for (int i = 0; i < kNumFiles; ++i) {
    if (i > 0) {
      // range tombstone covers first half of the previous file
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr((i - 1) * kNumPerFile),
                       GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2));
    }
    // Make sure a given key appears in each file so compaction won't be able to
    // use trivial move, which would happen if the ranges were non-overlapping.
    // Also, we need an extra element since flush is only triggered when the
    // number of keys is one greater than SpecialSkipListFactory's limit.
    // We choose a key outside the key-range used by the test to avoid conflict.
    db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val");

    for (int j = 0; j < kNumPerFile; ++j) {
      db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val");
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
  }
  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_GT(NumTableFilesAtLevel(1), 0);
  ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
            TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));

  for (int i = 0; i < kNumFiles; ++i) {
    for (int j = 0; j < kNumPerFile; ++j) {
      ReadOptions read_opts;
      read_opts.ignore_range_deletions = true;
      std::string value;
      if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
        ASSERT_OK(
            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
      } else {
        ASSERT_TRUE(
            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
                .IsNotFound());
      }
    }
  }
}

TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
  const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumFiles;
  options.max_bytes_for_level_base = 2 * kFileBytes;
  options.max_subcompactions = 4;
  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  options.num_levels = 3;
  options.target_file_size_base = kFileBytes;
  options.target_file_size_multiplier = 1;
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < kNumFiles; ++j) {
      if (i > 0) {
        // delete [95,105) in two files, [295,305) in next two
        int mid = (j + (1 - j % 2)) * kNumPerFile;
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                         Key(mid - 5), Key(mid + 5));
      }
      std::vector<std::string> values;
      // Write 100KB (100 values, each 1K)
      for (int k = 0; k < kNumPerFile; k++) {
        values.push_back(RandomString(&rnd, 990));
        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
      }
      // put extra key to trigger flush
      ASSERT_OK(Put("", ""));
      dbfull()->TEST_WaitForFlushMemTable();
      if (j < kNumFiles - 1) {
        // background compaction may happen early for kNumFiles'th file
        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
      }
      if (j == options.level0_file_num_compaction_trigger - 1) {
        // When i == 1, compaction will output some files to L1, at which point
        // L1 is not bottommost so range deletions cannot be compacted away. The
        // new L1 files must be generated with non-overlapping key ranges even
        // though multiple subcompactions see the same ranges deleted, else an
        // assertion will fail.
        //
        // Only enable auto-compactions when we're ready; otherwise, the
        // oversized L0 (relative to base_level) causes the compaction to run
        // earlier.
        ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
        dbfull()->TEST_WaitForCompact();
        ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                                  {{"disable_auto_compactions", "true"}}));
        ASSERT_EQ(NumTableFilesAtLevel(0), 0);
        ASSERT_GT(NumTableFilesAtLevel(1), 0);
        ASSERT_GT(NumTableFilesAtLevel(2), 0);
      }
    }
  }
}

TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
  const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
  Options options = CurrentOptions();
  options.compaction_options_universal.min_merge_width = kFilesPerLevel;
  options.compaction_options_universal.max_merge_width = kFilesPerLevel;
  options.compaction_options_universal.size_ratio = 10;
  options.compaction_style = kCompactionStyleUniversal;
  options.level0_file_num_compaction_trigger = kFilesPerLevel;
  options.max_subcompactions = 4;
  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  options.num_levels = kNumLevels;
  options.target_file_size_base = kNumPerFile << 10;
  options.target_file_size_multiplier = 1;
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < kNumLevels - 1; ++i) {
    for (int j = 0; j < kFilesPerLevel; ++j) {
      if (i == kNumLevels - 2) {
        // insert range deletions [95,105) in two files, [295,305) in next two
        // to prepare L1 for later manual compaction.
        int mid = (j + (1 - j % 2)) * kNumPerFile;
        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                         Key(mid - 5), Key(mid + 5));
      }
      std::vector<std::string> values;
      // Write 100KB (100 values, each 1K)
      for (int k = 0; k < kNumPerFile; k++) {
        values.push_back(RandomString(&rnd, 990));
        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
      }
      // put extra key to trigger flush
      ASSERT_OK(Put("", ""));
      dbfull()->TEST_WaitForFlushMemTable();
      if (j < kFilesPerLevel - 1) {
        // background compaction may happen early for kFilesPerLevel'th file
        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
      }
    }
    dbfull()->TEST_WaitForCompact();
    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
  }
  // Now that L1-L3 are full, when we compact L1->L2 we should see (1)
  // subcompactions happen since the input level > 0, and (2) range deletions
  // not dropped since the output level is not bottommost. If no file boundary
  // assertion fails, universal compaction + subcompaction + range deletion are
  // likely compatible.
  ASSERT_OK(dbfull()->RunManualCompaction(
      reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
          ->cfd(),
      1 /* input_level */, 2 /* output_level */, 0 /* output_path_id */,
      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
      true /* disallow_trivial_move */));
}
#endif  // ROCKSDB_LITE

TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
  const int kNumPerFile = 3, kNumFiles = 3;
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile));
  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
  opts.num_levels = 2;
  Reopen(opts);

  // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
  // requires an extra entry.
  for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
    if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
      // Delete merge operands from all but the last file
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
                       "key_");
    }
    std::string val;
    PutFixed64(&val, i);
    db_->Merge(WriteOptions(), "key", val);
    // we need to prevent trivial move using Puts so compaction will actually
    // process the merge operands.
    db_->Put(WriteOptions(), "prevent_trivial_move", "");
    if (i > 0 && i % kNumPerFile == 0) {
      dbfull()->TEST_WaitForFlushMemTable();
    }
  }

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  std::string expected, actual;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 45);  // 1+2+...+9
  ASSERT_EQ(expected, actual);

  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);

  expected.clear();
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  uint64_t tmp;
  Slice tmp2(actual);
  GetFixed64(&tmp2, &tmp);
  PutFixed64(&expected, 30);  // 6+7+8+9 (earlier operands covered by tombstone)
  ASSERT_EQ(expected, actual);
}

// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
  // During compaction to bottommost level, verify range tombstones older than
  // the oldest snapshot are removed, while others are preserved.
  Options opts = CurrentOptions();
  opts.disable_auto_compactions = true;
  opts.num_levels = 2;
  opts.statistics = CreateDBStatistics();
  Reopen(opts);

  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
                   "dr1");  // obsolete after compaction
  db_->Put(WriteOptions(), "key", "val");
  db_->Flush(FlushOptions());
  const Snapshot* snapshot = db_->GetSnapshot();
  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
                   "dr2");  // protected by snapshot
  db_->Put(WriteOptions(), "key", "val");
  db_->Flush(FlushOptions());

  ASSERT_EQ(2, NumTableFilesAtLevel(0));
  ASSERT_EQ(0, NumTableFilesAtLevel(1));
  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
  ASSERT_EQ(0, NumTableFilesAtLevel(0));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));
  ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));

  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
  // The RangeDelAggregator holds pointers into range deletion blocks created by
  // table readers. This test ensures the aggregator can still access those
  // blocks even if it outlives the table readers that created them.
  //
  // DBIter always keeps readers open for L0 files. So, in order to test
  // aggregator outliving reader, we need to have deletions in L1 files, which
  // are opened/closed on-demand during the scan. This is accomplished by
  // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
  // from all lingering in L0 (there is at most one range deletion per L0 file).
  //
  // The first L1 file will contain a range deletion since its begin key is 0.
  // SeekToFirst() references that table's reader and adds its range tombstone
  // to the aggregator. Upon advancing beyond that table's key-range via Next(),
  // the table reader will be unreferenced by the iterator. Since we manually
  // call Evict() on all readers before the full scan, this unreference causes
  // the reader's refcount to drop to zero and thus be destroyed.
  //
  // When it is destroyed, we do not remove its range deletions from the
  // aggregator. So, subsequent calls to Next() must be able to use these
  // deletions to decide whether a key is covered. This will work as long as
  // the aggregator properly references the range deletion block.
  const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.level0_file_num_compaction_trigger = 4;
  opts.level0_stop_writes_trigger = 4;
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  opts.num_levels = 2;
  BlockBasedTableOptions bbto;
  bbto.cache_index_and_filter_blocks = true;
  bbto.block_cache = NewLRUCache(8 << 20);
  opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(opts);

  // Hold a snapshot so range deletions can't become obsolete during compaction
  // to bottommost level (i.e., L1).
  const Snapshot* snapshot = db_->GetSnapshot();
  for (int i = 0; i < kNum; ++i) {
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
    if (i > 0) {
      dbfull()->TEST_WaitForFlushMemTable();
    }
    if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
  }
  // Must be > 1 so the first L1 file can be closed before scan finishes
  dbfull()->TEST_WaitForCompact();
  ASSERT_GT(NumTableFilesAtLevel(1), 1);
  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);

  ReadOptions read_opts;
  auto* iter = db_->NewIterator(read_opts);
  int expected = kRangeEnd;
  iter->SeekToFirst();
  for (auto file_number : file_numbers) {
    // This puts table caches in the state of being externally referenced only
    // so they are destroyed immediately upon iterator unreferencing.
    TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
  }
  for (; iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
    ++expected;
    // Keep clearing block cache's LRU so range deletion block can be freed as
    // soon as its refcount drops to zero.
    bbto.block_cache->EraseUnRefEntries();
  }
  ASSERT_EQ(kNum, expected);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
  db_->Put(WriteOptions(), "key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  std::string value;
  ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 3;
  opts.min_write_buffer_number_to_merge = 2;
  // SpecialSkipListFactory lets us specify maximum number of elements the
  // memtable can hold. It switches the active memtable to immutable (flush is
  // prevented by the above options) upon inserting an element that would
  // overflow the memtable.
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  Reopen(opts);

  db_->Put(WriteOptions(), "key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  db_->Put(WriteOptions(), "blah", "val");

  ReadOptions read_opts;
  std::string value;
  ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
}

TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
  db_->Put(WriteOptions(), "key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));

  ReadOptions read_opts;
  std::string value;
  ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
  const int kNumMergeOps = 10;
  Options opts = CurrentOptions();
  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
  Reopen(opts);

  for (int i = 0; i < kNumMergeOps; ++i) {
    std::string val;
    PutFixed64(&val, i);
    db_->Merge(WriteOptions(), "key", val);
    if (i == kNumMergeOps / 2) {
      // deletes [0, 5]
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
                       "key_");
    }
  }

  ReadOptions read_opts;
  std::string expected, actual;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 30);  // 6+7+8+9
  ASSERT_EQ(expected, actual);

  expected.clear();
  read_opts.ignore_range_deletions = true;
  ASSERT_OK(db_->Get(read_opts, "key", &actual));
  PutFixed64(&expected, 45);  // 0+1+2+...+9
  ASSERT_EQ(expected, actual);
}

TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 4;
  opts.min_write_buffer_number_to_merge = 3;
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  Reopen(opts);

  db_->Put(WriteOptions(), "sst_key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  db_->Put(WriteOptions(), "imm_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  db_->Put(WriteOptions(), "mem_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
    std::string value;
    ASSERT_OK(db_->Get(read_opts, key, &value));
  }
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  Reopen(opts);

  // Write half of the keys before the tombstone and half after the tombstone.
  // Only covered keys (i.e., within the range and older than the tombstone)
  // should be deleted.
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 2) {
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
  }
  ReadOptions read_opts;
  auto* iter = db_->NewIterator(read_opts);

  int expected = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
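    // Keys [kRangeBegin, kNum / 2) were written before the tombstone and are
    // covered by it, so the iterator jumps from key 49 directly to key 100.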
    if (expected == kRangeBegin - 1) {
      expected = kNum / 2;
    } else {
      ++expected;
    }
  }
  ASSERT_EQ(kNum, expected);
  delete iter;
}

TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
  Options opts = CurrentOptions();
  opts.comparator = test::Uint64Comparator();
  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
  Reopen(opts);

  const Snapshot* snapshot = nullptr;
  // Put a snapshot before the range tombstone, verify an iterator using that
  // snapshot sees all inserted keys.
  for (int i = 0; i < kNum; ++i) {
    if (i == kNum / 2) {
      snapshot = db_->GetSnapshot();
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
    }
    db_->Put(WriteOptions(), GetNumericStr(i), "val");
  }
  ReadOptions read_opts;
  read_opts.snapshot = snapshot;
  auto* iter = db_->NewIterator(read_opts);

  int expected = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ASSERT_EQ(GetNumericStr(expected), iter->key());
    ++expected;
  }
  ASSERT_EQ(kNum / 2, expected);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
  Options opts = CurrentOptions();
  opts.max_write_buffer_number = 4;
  opts.min_write_buffer_number_to_merge = 3;
  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
  Reopen(opts);

  db_->Put(WriteOptions(), "sst_key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  ASSERT_OK(db_->Flush(FlushOptions()));
  db_->Put(WriteOptions(), "imm_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
  db_->Put(WriteOptions(), "mem_key", "val");
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  ReadOptions read_opts;
  read_opts.ignore_range_deletions = true;
  auto* iter = db_->NewIterator(read_opts);
  int i = 0;
  std::string expected[] = {"imm_key", "mem_key", "sst_key"};
  for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
    std::string key;
    ASSERT_EQ(expected[i], iter->key());
  }
  ASSERT_EQ(3, i);
  delete iter;
  db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
  db_->Put(WriteOptions(), "key", "val");
  // snapshot prevents key from being deleted during flush
  const Snapshot* snapshot = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));

  // iterations check unsupported in memtable, l0, and then l1
  for (int i = 0; i < 3; ++i) {
    ReadOptions read_opts;
    read_opts.tailing = true;
    auto* iter = db_->NewIterator(read_opts);
    if (i == 2) {
      // For L1+, iterators over files are created on-demand, so need seek
      iter->SeekToFirst();
    }
    ASSERT_TRUE(iter->status().IsNotSupported());
    delete iter;
    if (i == 0) {
      ASSERT_OK(db_->Flush(FlushOptions()));
    } else if (i == 1) {
      MoveFilesToLevel(1);
    }
  }
  db_->ReleaseSnapshot(snapshot);
}
#endif  // ROCKSDB_LITE

}  // namespace rocksdb

int main(int argc, char** argv) {
  rocksdb::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}