# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import os
import random
import shutil
import subprocess
import sys
import tempfile
import time
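
# Illustrative invocations (the authoritative flag list is built in main() below):
#   python3 db_crashtest.py blackbox --simple
#   python3 db_crashtest.py whitebox --txn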

# params overwrite priority:
#   for default:
#       default_params < {blackbox,whitebox}_default_params < args
#   for simple:
#       default_params < {blackbox,whitebox}_default_params <
#       simple_default_params <
#       {blackbox,whitebox}_simple_default_params < args
#   for cf_consistency:
#       default_params < {blackbox,whitebox}_default_params <
#       cf_consistency_params < args
#   for txn:
#       default_params < {blackbox,whitebox}_default_params < txn_params < args
#   for ts:
#       default_params < {blackbox,whitebox}_default_params < ts_params < args
#   for multiops_txn:
#       default_params < {blackbox,whitebox}_default_params < multiops_txn_params < args
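#
# The precedence above is implemented in gen_cmd_params() via successive
# dict.update() calls: each later update() overwrites keys from earlier ones,
# and explicitly passed command-line args are applied last.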
34 "acquire_snapshot_one_in": 10000,
35 "backup_max_size": 100 * 1024 * 1024,
36 # Consider larger number when backups considered more stable
37 "backup_one_in": 100000,
38 "batch_protection_bytes_per_key": lambda: random
.choice([0, 8]),
39 "memtable_protection_bytes_per_key": lambda: random
.choice([0, 1, 2, 4, 8]),
41 "bloom_bits": lambda: random
.choice(
42 [random
.randint(0, 19), random
.lognormvariate(2.3, 1.3)]
44 "cache_index_and_filter_blocks": lambda: random
.randint(0, 1),
45 "cache_size": 8388608,
46 "charge_compression_dictionary_building_buffer": lambda: random
.choice([0, 1]),
47 "charge_filter_construction": lambda: random
.choice([0, 1]),
48 "charge_table_reader": lambda: random
.choice([0, 1]),
49 "charge_file_metadata": lambda: random
.choice([0, 1]),
50 "checkpoint_one_in": 1000000,
51 "compression_type": lambda: random
.choice(
52 ["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
54 "bottommost_compression_type": lambda: "disable"
55 if random
.randint(0, 1) == 0
56 else random
.choice(["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]),
57 "checksum_type": lambda: random
.choice(
58 ["kCRC32c", "kxxHash", "kxxHash64", "kXXH3"]
60 "compression_max_dict_bytes": lambda: 16384 * random
.randint(0, 1),
61 "compression_zstd_max_train_bytes": lambda: 65536 * random
.randint(0, 1),
    # Disabled compression_parallel_threads as the feature is not stable
    # lambda: random.choice([1] * 9 + [4])
    "compression_parallel_threads": 1,
    "compression_max_dict_buffer_bytes": lambda: (1 << random.randint(0, 40)) - 1,
    "compression_use_zstd_dict_trainer": lambda: random.randint(0, 1),
    "clear_column_family_one_in": 0,
    "compact_files_one_in": 1000000,
    "compact_range_one_in": 1000000,
    "compaction_pri": random.randint(0, 4),
    "data_block_index_type": lambda: random.choice([0, 1]),
    "delpercent": 4,
    "delrangepercent": 1,
    "destroy_db_initially": 0,
    "enable_pipelined_write": lambda: random.randint(0, 1),
    "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]),
    "expected_values_dir": lambda: setup_expected_values_dir(),
    "fail_if_options_file_error": lambda: random.randint(0, 1),
    "flush_one_in": 1000000,
    "manual_wal_flush_one_in": lambda: random.choice([0, 0, 1000, 1000000]),
    "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]),
    "get_live_files_one_in": 1000000,
    # Note: the following two are intentionally disabled as the corresponding
    # APIs are not guaranteed to succeed.
    "get_sorted_wal_files_one_in": 0,
    "get_current_wal_file_one_in": 0,
    # Temporarily disable hash index
    "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
    "ingest_external_file_one_in": 1000000,
    "iterpercent": 10,
    "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1),
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10485760,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "mmap_read": lambda: random.randint(0, 1),
    # Setting `nooverwritepercent > 0` is only possible because we do not vary
    # the random seed, so the same keys are chosen by every run for disallowing
    # overwrites.
    "nooverwritepercent": 1,
101 "open_files": lambda: random
.choice([-1, -1, 100, 500000]),
102 "optimize_filters_for_memory": lambda: random
.randint(0, 1),
103 "partition_filters": lambda: random
.randint(0, 1),
104 "partition_pinning": lambda: random
.randint(0, 3),
105 "pause_background_one_in": 1000000,
106 "prefix_size": lambda: random
.choice([-1, 1, 5, 7, 8]),
108 "progress_reports": 0,
110 "recycle_log_file_num": lambda: random
.randint(0, 1),
111 "snapshot_hold_ops": 100000,
112 "sst_file_manager_bytes_per_sec": lambda: random
.choice([0, 104857600]),
113 "sst_file_manager_bytes_per_truncate": lambda: random
.choice([0, 1048576]),
114 "long_running_snapshots": lambda: random
.randint(0, 1),
115 "subcompactions": lambda: random
.randint(1, 4),
116 "target_file_size_base": 2097152,
117 "target_file_size_multiplier": 2,
118 "test_batches_snapshots": random
.randint(0, 1),
119 "top_level_index_pinning": lambda: random
.randint(0, 3),
120 "unpartitioned_pinning": lambda: random
.randint(0, 3),
121 "use_direct_reads": lambda: random
.randint(0, 1),
122 "use_direct_io_for_flush_and_compaction": lambda: random
.randint(0, 1),
123 "mock_direct_io": False,
124 "cache_type": lambda: random
.choice(["lru_cache", "hyper_clock_cache"]),
125 "use_full_merge_v1": lambda: random
.randint(0, 1),
126 "use_merge": lambda: random
.randint(0, 1),
127 # use_put_entity_one_in has to be the same across invocations for verification to work, hence no lambda
128 "use_put_entity_one_in": random
.choice([0] * 7 + [1, 5, 10]),
129 # 999 -> use Bloom API
130 "ribbon_starting_level": lambda: random
.choice([random
.randint(-1, 10), 999]),
131 "value_size_mult": 32,
132 "verify_checksum": 1,
133 "write_buffer_size": 4 * 1024 * 1024,
135 "format_version": lambda: random
.choice([2, 3, 4, 5, 5]),
136 "index_block_restart_interval": lambda: random
.choice(range(1, 16)),
137 "use_multiget": lambda: random
.randint(0, 1),
138 "periodic_compaction_seconds": lambda: random
.choice([0, 0, 1, 2, 10, 100, 1000]),
139 # 0 = never (used by some), 10 = often (for threading bugs), 600 = default
140 "stats_dump_period_sec": lambda: random
.choice([0, 10, 600]),
141 "compaction_ttl": lambda: random
.choice([0, 0, 1, 2, 10, 100, 1000]),
142 # Test small max_manifest_file_size in a smaller chance, as most of the
143 # time we wnat manifest history to be preserved to help debug
144 "max_manifest_file_size": lambda: random
.choice(
145 [t
* 16384 if t
< 3 else 1024 * 1024 * 1024 for t
in range(1, 30)]
    # Sync mode might make test runs slower, so enable it only with a small probability
    "sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
    "bytes_per_sync": lambda: random.choice([0, 262144]),
    "wal_bytes_per_sync": lambda: random.choice([0, 524288]),
    # Disable compaction_readahead_size because the test is not passing.
    # "compaction_readahead_size" : lambda : random.choice(
    #     [0, 0, 1024 * 1024]),
    "db_write_buffer_size": lambda: random.choice(
        [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]
    ),
    "avoid_unnecessary_blocking_io": random.randint(0, 1),
    "write_dbid_to_manifest": random.randint(0, 1),
    "avoid_flush_during_recovery": lambda: random.choice(
        [1 if t == 0 else 0 for t in range(0, 8)]
    ),
    "max_write_batch_group_size_bytes": lambda: random.choice(
        [16, 64, 1024 * 1024, 16 * 1024 * 1024]
    ),
    "level_compaction_dynamic_level_bytes": True,
    "verify_checksum_one_in": 1000000,
    "verify_db_one_in": 100000,
    "continuous_verification_interval": 0,
    "max_key_len": 3,
    "key_len_percent_dist": "1,30,69",
    "read_fault_one_in": lambda: random.choice([0, 32, 1000]),
    "open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]),
    "open_write_fault_one_in": lambda: random.choice([0, 0, 16]),
    "open_read_fault_one_in": lambda: random.choice([0, 0, 32]),
    "sync_fault_injection": lambda: random.randint(0, 1),
    "get_property_one_in": 1000000,
    "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]),
    "max_write_buffer_size_to_maintain": lambda: random.choice(
        [0, 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024]
    ),
    "user_timestamp_size": 0,
    "secondary_cache_fault_one_in": lambda: random.choice([0, 0, 32]),
    "prepopulate_block_cache": lambda: random.choice([0, 1]),
    "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
    "memtable_whole_key_filtering": lambda: random.randint(0, 1),
    "detect_filter_construct_corruption": lambda: random.choice([0, 1]),
    "adaptive_readahead": lambda: random.choice([0, 1]),
    "async_io": lambda: random.choice([0, 1]),
    "wal_compression": lambda: random.choice(["none", "zstd"]),
    "verify_sst_unique_id_in_manifest": 1,  # always do unique_id verification
    "secondary_cache_uri": lambda: random.choice(
        [
            "",
            "compressed_secondary_cache://capacity=8388608",
            "compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true",
        ]
    ),
198 "allow_data_in_errors": True,
199 "readahead_size": lambda: random
.choice([0, 16384, 524288]),
200 "initial_auto_readahead_size": lambda: random
.choice([0, 16384, 524288]),
201 "max_auto_readahead_size": lambda: random
.choice([0, 16384, 524288]),
202 "num_file_reads_for_auto_readahead": lambda: random
.choice([0, 1, 2]),
203 "min_write_buffer_number_to_merge": lambda: random
.choice([1, 2]),
204 "preserve_internal_time_seconds": lambda: random
.choice([0, 60, 3600, 36000]),
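
# Any callable value above is re-evaluated by finalize_and_sanitize() for every
# db_stress invocation, so each run samples a fresh value; non-callable entries
# stay fixed for the life of the script.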

_TEST_DIR_ENV_VAR = "TEST_TMPDIR"
_DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"

stress_cmd = "./db_stress"
cleanup_cmd = None


def is_release_mode():
    return os.environ.get(_DEBUG_LEVEL_ENV_VAR) == "0"


def get_dbname(test_name):
    test_dir_name = "rocksdb_crashtest_" + test_name
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is None or test_tmpdir == "":
        dbname = tempfile.mkdtemp(prefix=test_dir_name)
    else:
        dbname = test_tmpdir + "/" + test_dir_name
        shutil.rmtree(dbname, True)
        if cleanup_cmd is not None:
            print("Running DB cleanup command - %s\n" % cleanup_cmd)
            # Ignore failure
            os.system(cleanup_cmd)
        os.mkdir(dbname)
    return dbname


expected_values_dir = None


def setup_expected_values_dir():
    global expected_values_dir
    if expected_values_dir is not None:
        return expected_values_dir
    expected_dir_prefix = "rocksdb_crashtest_expected_"
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is None or test_tmpdir == "":
        expected_values_dir = tempfile.mkdtemp(prefix=expected_dir_prefix)
    else:
        # if tmpdir is specified, store the expected_values_dir under that dir
        expected_values_dir = test_tmpdir + "/rocksdb_crashtest_expected"
        if os.path.exists(expected_values_dir):
            shutil.rmtree(expected_values_dir)
        os.mkdir(expected_values_dir)
    return expected_values_dir
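
# The expected values dir holds db_stress's expected-state files, which let a
# restarted run verify recovered data against what earlier runs acknowledged;
# it is deleted only when the whole test passes (see main()).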


multiops_txn_key_spaces_file = None


def setup_multiops_txn_key_spaces_file():
    global multiops_txn_key_spaces_file
    if multiops_txn_key_spaces_file is not None:
        return multiops_txn_key_spaces_file
    key_spaces_file_prefix = "rocksdb_crashtest_multiops_txn_key_spaces"
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is None or test_tmpdir == "":
        multiops_txn_key_spaces_file = tempfile.mkstemp(prefix=key_spaces_file_prefix)[1]
    else:
        if not os.path.exists(test_tmpdir):
            os.mkdir(test_tmpdir)
        multiops_txn_key_spaces_file = tempfile.mkstemp(
            prefix=key_spaces_file_prefix, dir=test_tmpdir
        )[1]
    return multiops_txn_key_spaces_file
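
# tempfile.mkstemp() returns an (fd, path) tuple; indexing with [1] keeps only
# the path for db_stress's key_spaces_path parameter. The file descriptor in
# slot [0] is never closed here and lives for the duration of the script.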


def is_direct_io_supported(dbname):
    with tempfile.NamedTemporaryFile(dir=dbname) as f:
        try:
            os.open(f.name, os.O_DIRECT)
        except BaseException:
            return False
        return True
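
# The probe above deliberately catches everything: if os.open() with O_DIRECT
# fails (e.g. on filesystems like tmpfs that reject it), direct IO is treated
# as unsupported, and finalize_and_sanitize() either disables it (release
# builds) or falls back to mock_direct_io.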


blackbox_default_params = {
    "disable_wal": lambda: random.choice([0, 0, 0, 1]),
    # total time for this script to test db_stress
    "duration": 6000,
    # time for one db_stress instance to run
    "interval": 120,
    # since we will be killing anyway, use large value for ops_per_thread
    "ops_per_thread": 100000000,
    "reopen": 0,
    "set_options_one_in": 10000,
}

whitebox_default_params = {
    # TODO: enable this once we figure out how to adjust kill odds for WAL-
    # disabled runs, and either (1) separate full `db_stress` runs out of
    # whitebox crash or (2) support verification at end of `db_stress` runs
    # that ran with WAL disabled.
    "disable_wal": 0,
    "duration": 10000,
    "log2_keys_per_lock": 10,
    "ops_per_thread": 200000,
    "random_kill_odd": 888887,
    "reopen": 20,
}

simple_default_params = {
    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
    "column_families": 1,
    # TODO: re-enable once internal task T124324915 is fixed.
    # "experimental_mempurge_threshold": lambda: 10.0*random.random(),
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "memtablerep": "skip_list",
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "write_buffer_size": 32 * 1024 * 1024,
    "level_compaction_dynamic_level_bytes": False,
    "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]),
    "verify_iterator_with_expected_state_one_in": 5,  # this locks a range of keys
}

blackbox_simple_default_params = {
    "open_files": -1,
    "set_options_one_in": 0,
}

whitebox_simple_default_params = {}

cf_consistency_params = {
    "disable_wal": lambda: random.randint(0, 1),
    "reopen": 0,
    "test_cf_consistency": 1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    "write_buffer_size": 1024 * 1024,
    "enable_pipelined_write": lambda: random.randint(0, 1),
    # Snapshots are used heavily in this test mode, while they are incompatible
    # with compaction filter.
    "enable_compaction_filter": 0,
    # `CfConsistencyStressTest::TestIngestExternalFile()` is not implemented.
    "ingest_external_file_one_in": 0,
}

txn_params = {
    "use_txn": 1,
    # Avoid lambda to set it once for the entire test
    "txn_write_policy": random.randint(0, 2),
    "unordered_write": random.randint(0, 1),
    # TODO: there is such a thing as transactions with WAL disabled. We should
    # crash test it too.
    "disable_wal": 0,
    # OpenReadOnly after checkpoint is not currently compatible with WritePrepared txns
    "checkpoint_one_in": 0,
    # pipelined write is not currently compatible with WritePrepared txns
    "enable_pipelined_write": 0,
    "create_timestamped_snapshot_one_in": random.choice([0, 20]),
    # PutEntity in transactions is not yet implemented
    "use_put_entity_one_in": 0,
}

best_efforts_recovery_params = {
    "best_efforts_recovery": 1,
    "atomic_flush": 0,
    "disable_wal": 1,
    "column_families": 1,
}
374 "allow_setting_blob_options_dynamically": 1,
375 # Enable blob files and GC with a 75% chance initially; note that they might still be
376 # enabled/disabled during the test via SetOptions
377 "enable_blob_files": lambda: random
.choice([0] + [1] * 3),
378 "min_blob_size": lambda: random
.choice([0, 8, 16]),
379 "blob_file_size": lambda: random
.choice([1048576, 16777216, 268435456, 1073741824]),
380 "blob_compression_type": lambda: random
.choice(["none", "snappy", "lz4", "zstd"]),
381 "enable_blob_garbage_collection": lambda: random
.choice([0] + [1] * 3),
382 "blob_garbage_collection_age_cutoff": lambda: random
.choice(
383 [0.0, 0.25, 0.5, 0.75, 1.0]
385 "blob_garbage_collection_force_threshold": lambda: random
.choice([0.5, 0.75, 1.0]),
386 "blob_compaction_readahead_size": lambda: random
.choice([0, 1048576, 4194304]),
387 "blob_file_starting_level": lambda: random
.choice(
388 [0] * 4 + [1] * 3 + [2] * 2 + [3]
390 "use_blob_cache": lambda: random
.randint(0, 1),
391 "use_shared_block_and_blob_cache": lambda: random
.randint(0, 1),
392 "blob_cache_size": lambda: random
.choice([1048576, 2097152, 4194304, 8388608]),
393 "prepopulate_blob_cache": lambda: random
.randint(0, 1),
397 "test_cf_consistency": 0,
398 "test_batches_snapshots": 0,
399 "user_timestamp_size": 8,
401 "use_full_merge_v1": 0,
403 "enable_blob_files": 0,
405 "ingest_external_file_one_in": 0,
406 # PutEntity with timestamps is not yet implemented
407 "use_put_entity_one_in" : 0,
411 "enable_tiered_storage": 1,
412 # Set tiered compaction hot data time as: 1 minute, 1 hour, 10 hour
413 "preclude_last_level_data_seconds": lambda: random
.choice([60, 3600, 36000]),
414 # only test universal compaction for now, level has known issue of
416 "compaction_style": 1,
417 # tiered storage doesn't support blob db yet
418 "enable_blob_files": 0,

multiops_txn_default_params = {
    "test_cf_consistency": 0,
    "test_batches_snapshots": 0,
    "test_multi_ops_txns": 1,
    "use_txn": 1,
    "two_write_queues": lambda: random.choice([0, 1]),
    # TODO: enable write-prepared
    "disable_wal": 0,
    "use_only_the_last_commit_time_batch_for_recovery": lambda: random.choice([0, 1]),
    "clear_column_family_one_in": 0,
    "column_families": 1,
    "enable_pipelined_write": lambda: random.choice([0, 1]),
    # This test already acquires snapshots in reads
    "acquire_snapshot_one_in": 0,
    "backup_one_in": 0,
    "writepercent": 0,
    "delpercent": 0,
    "delrangepercent": 0,
    "customopspercent": 80,
    "readpercent": 5,
    "iterpercent": 15,
    "verify_db_one_in": 1000,
    "continuous_verification_interval": 1000,
    "delay_snapshot_read_one_in": 3,
    # 65536 is the smallest possible value for write_buffer_size. Smaller
    # values will be sanitized to 65536 during db open. SetOptions currently
    # does not sanitize options, but very small write_buffer_size may cause
    # assertion failure in
    # https://github.com/facebook/rocksdb/blob/7.0.fb/db/memtable.cc#L117.
    "write_buffer_size": 65536,
    # flush more frequently to generate more files, thus trigger more
    # compactions.
    "flush_one_in": 1000,
    "key_spaces_path": setup_multiops_txn_key_spaces_file(),
    "rollback_one_in": 4,
    # Re-enable once we have a compaction for MultiOpsTxnStressTest
    "enable_compaction_filter": 0,
    "create_timestamped_snapshot_one_in": 50,
    "sync_fault_injection": 0,
    # PutEntity in transactions is not yet implemented
    "use_put_entity_one_in": 0,
}

multiops_wc_txn_params = {
    "txn_write_policy": 0,
    # TODO re-enable pipelined write. Not well tested atm
    "enable_pipelined_write": 0,
}

multiops_wp_txn_params = {
    "txn_write_policy": 1,
    "wp_snapshot_cache_bits": 1,
    # try small wp_commit_cache_bits, e.g. 0 once we explore storing full
    # commit sequence numbers in commit cache
    "wp_commit_cache_bits": 10,
    # pipelined write is not currently compatible with WritePrepared txns
    "enable_pipelined_write": 0,
    # OpenReadOnly after checkpoint is not currently compatible with WritePrepared txns
    "checkpoint_one_in": 0,
    # Required to be 1 in order to use commit-time-batch
    "use_only_the_last_commit_time_batch_for_recovery": 1,
    "clear_wp_commit_cache_one_in": 10,
    "create_timestamped_snapshot_one_in": 0,
}


def finalize_and_sanitize(src_params):
    dest_params = {k: v() if callable(v) else v for (k, v) in src_params.items()}
    if is_release_mode():
        dest_params["read_fault_one_in"] = 0
    if dest_params.get("compression_max_dict_bytes") == 0:
        dest_params["compression_zstd_max_train_bytes"] = 0
        dest_params["compression_max_dict_buffer_bytes"] = 0
    if dest_params.get("compression_type") != "zstd":
        dest_params["compression_zstd_max_train_bytes"] = 0
    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
        dest_params["memtablerep"] = "skip_list"
    if dest_params["mmap_read"] == 1:
        dest_params["use_direct_io_for_flush_and_compaction"] = 0
        dest_params["use_direct_reads"] = 0
        if dest_params["file_checksum_impl"] != "none":
            # TODO(T109283569): there is a bug in `GenerateOneFileChecksum()`,
            # used by `IngestExternalFile()`, causing it to fail with mmap
            # reads. Remove this once it is fixed.
            dest_params["ingest_external_file_one_in"] = 0
    if (
        dest_params["use_direct_io_for_flush_and_compaction"] == 1
        or dest_params["use_direct_reads"] == 1
    ) and not is_direct_io_supported(dest_params["db"]):
        if is_release_mode():
            print(
                "{} does not support direct IO. Disabling use_direct_reads and "
                "use_direct_io_for_flush_and_compaction.\n".format(dest_params["db"])
            )
            dest_params["use_direct_reads"] = 0
            dest_params["use_direct_io_for_flush_and_compaction"] = 0
        else:
            dest_params["mock_direct_io"] = True

    if dest_params["test_batches_snapshots"] == 1:
        dest_params["enable_compaction_filter"] = 0
        if dest_params["prefix_size"] < 0:
            dest_params["prefix_size"] = 1

    # Multi-key operations are not currently compatible with transactions or
    # timestamp.
    if (dest_params.get("test_batches_snapshots") == 1 or
            dest_params.get("use_txn") == 1 or
            dest_params.get("user_timestamp_size") > 0):
        dest_params["ingest_external_file_one_in"] = 0
    if (dest_params.get("test_batches_snapshots") == 1 or
            dest_params.get("use_txn") == 1):
        dest_params["delpercent"] += dest_params["delrangepercent"]
        dest_params["delrangepercent"] = 0
    if (
        dest_params.get("disable_wal") == 1
        or dest_params.get("sync_fault_injection") == 1
        or dest_params.get("manual_wal_flush_one_in") > 0
    ):
        # File ingestion does not guarantee prefix-recoverability when unsynced
        # data can be lost. Ingesting a file syncs data immediately that is
        # newer than unsynced memtable data that can be lost on restart.
        #
        # Even if the above issue is fixed or worked around, our
        # trace-and-replay does not trace file ingestion, so in its current form
        # it would not recover the expected state to the correct point in time.
        dest_params["ingest_external_file_one_in"] = 0
        # The `DbStressCompactionFilter` can apply memtable updates to SST
        # files, which would be problematic when unsynced data can be lost in
        # crash recoveries.
        dest_params["enable_compaction_filter"] = 0
    # Only under WritePrepared txns, unordered_write would provide the same
    # guarantees as vanilla rocksdb
    if dest_params.get("unordered_write", 0) == 1:
        dest_params["txn_write_policy"] = 1
        dest_params["allow_concurrent_memtable_write"] = 1
    if dest_params.get("disable_wal", 0) == 1:
        dest_params["atomic_flush"] = 1
        dest_params["sync"] = 0
        dest_params["write_fault_one_in"] = 0
    if dest_params.get("open_files", 1) != -1:
        # Compaction TTL and periodic compactions are only compatible
        # with open_files = -1
        dest_params["compaction_ttl"] = 0
        dest_params["periodic_compaction_seconds"] = 0
    if dest_params.get("compaction_style", 0) == 2:
        # Disable compaction TTL in FIFO compaction, because right
        # now assertion failures are triggered.
        dest_params["compaction_ttl"] = 0
        dest_params["periodic_compaction_seconds"] = 0
    if dest_params["partition_filters"] == 1:
        if dest_params["index_type"] != 2:
            dest_params["partition_filters"] = 0
    if dest_params.get("atomic_flush", 0) == 1:
        # disable pipelined write when atomic flush is used.
        dest_params["enable_pipelined_write"] = 0
    if dest_params.get("sst_file_manager_bytes_per_sec", 0) == 0:
        dest_params["sst_file_manager_bytes_per_truncate"] = 0
    if dest_params.get("enable_compaction_filter", 0) == 1:
        # Compaction filter is incompatible with snapshots. Need to avoid taking
        # snapshots, as well as avoid operations that use snapshots for
        # verification.
        dest_params["acquire_snapshot_one_in"] = 0
        dest_params["compact_range_one_in"] = 0
        # Give the iterator ops away to reads.
        dest_params["readpercent"] += dest_params.get("iterpercent", 10)
        dest_params["iterpercent"] = 0
    if dest_params.get("prefix_size") == -1:
        dest_params["readpercent"] += dest_params.get("prefixpercent", 20)
        dest_params["prefixpercent"] = 0
    if (
        dest_params.get("prefix_size") == -1
        and dest_params.get("memtable_whole_key_filtering") == 0
    ):
        dest_params["memtable_prefix_bloom_size_ratio"] = 0
    if dest_params.get("two_write_queues") == 1:
        dest_params["enable_pipelined_write"] = 0
    if dest_params.get("best_efforts_recovery") == 1:
        dest_params["disable_wal"] = 1
        dest_params["atomic_flush"] = 0
        dest_params["enable_compaction_filter"] = 0
        dest_params["sync"] = 0
        dest_params["write_fault_one_in"] = 0
    if dest_params["secondary_cache_uri"] != "":
        # Currently the only cache type compatible with a secondary cache is LRUCache
        dest_params["cache_type"] = "lru_cache"
    # Remove the following once write-prepared/write-unprepared with/without
    # unordered write supports timestamped snapshots
    if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0:
        dest_params["txn_write_policy"] = 0
        dest_params["unordered_write"] = 0
    # For TransactionDB, correctness testing with unsync data loss is currently
    # compatible with only write committed policy
    if dest_params.get("use_txn") == 1 and dest_params.get("txn_write_policy") != 0:
        dest_params["sync_fault_injection"] = 0
        dest_params["manual_wal_flush_one_in"] = 0
    # PutEntity is currently not supported by SstFileWriter or in conjunction
    # with Merge
    if dest_params["use_put_entity_one_in"] != 0:
        dest_params["ingest_external_file_one_in"] = 0
        dest_params["use_merge"] = 0
        dest_params["use_full_merge_v1"] = 0

    return dest_params


def gen_cmd_params(args):
    params = {}

    params.update(default_params)
    if args.test_type == "blackbox":
        params.update(blackbox_default_params)
    if args.test_type == "whitebox":
        params.update(whitebox_default_params)
    if args.simple:
        params.update(simple_default_params)
        if args.test_type == "blackbox":
            params.update(blackbox_simple_default_params)
        if args.test_type == "whitebox":
            params.update(whitebox_simple_default_params)
    if args.cf_consistency:
        params.update(cf_consistency_params)
    if args.txn:
        params.update(txn_params)
    if args.test_best_efforts_recovery:
        params.update(best_efforts_recovery_params)
    if args.enable_ts:
        params.update(ts_params)
    if args.test_multiops_txn:
        params.update(multiops_txn_default_params)
        if args.write_policy == "write_committed":
            params.update(multiops_wc_txn_params)
        elif args.write_policy == "write_prepared":
            params.update(multiops_wp_txn_params)
    if args.test_tiered_storage:
        params.update(tiered_params)

    # Best-effort recovery, user defined timestamp, tiered storage are currently
    # incompatible with BlobDB. Test BE recovery if specified on the command
    # line; otherwise, apply BlobDB related overrides with a 10% chance.
    if (
        not args.test_best_efforts_recovery
        and not args.enable_ts
        and not args.test_tiered_storage
        and random.choice([0] * 9 + [1]) == 1
    ):
        params.update(blob_params)

    for k, v in vars(args).items():
        if v is not None:
            params[k] = v
    return params


def gen_cmd(params, unknown_params):
    finalized_params = finalize_and_sanitize(params)
    cmd = (
        [stress_cmd]
        + [
            "--{0}={1}".format(k, v)
            for k, v in [(k, finalized_params[k]) for k in sorted(finalized_params)]
            if k
            not in {
                "test_type",
                "simple",
                "duration",
                "interval",
                "random_kill_odd",
                "cf_consistency",
                "txn",
                "test_best_efforts_recovery",
                "enable_ts",
                "test_multiops_txn",
                "write_policy",
                "stress_cmd",
                "test_tiered_storage",
                "cleanup_cmd",
            }
            and v is not None
        ]
        + unknown_params
    )
    return cmd


def execute_cmd(cmd, timeout):
    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd)))

    try:
        outs, errs = child.communicate(timeout=timeout)
        hit_timeout = False
        print("WARNING: db_stress ended before kill: exitcode=%d\n" % child.returncode)
    except subprocess.TimeoutExpired:
        hit_timeout = True
        child.kill()
        print("KILLED %d\n" % child.pid)
        outs, errs = child.communicate()

    return hit_timeout, child.returncode, outs.decode("utf-8"), errs.decode("utf-8")


# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args, unknown_args):
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname("blackbox")
    exit_time = time.time() + cmd_params["duration"]

    print(
        "Running blackbox-crash-test with \n"
        + "interval_between_crash="
        + str(cmd_params["interval"])
        + "\n"
        + "total-duration="
        + str(cmd_params["duration"])
        + "\n"
    )

    while time.time() < exit_time:
        cmd = gen_cmd(
            dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
        )

        hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])

        if not hit_timeout:
            print("Exit Before Killing")
            print("stdout:")
            print(outs)
            print("stderr:")
            print(errs)
            sys.exit(2)

        for line in errs.split("\n"):
            if line != "" and not line.startswith("WARNING"):
                print("stderr has error message:")
                print("***" + line + "***")

        time.sleep(1)  # time to stabilize before the next run

    time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)


# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args, unknown_args):
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname("whitebox")

    cur_time = time.time()
    exit_time = cur_time + cmd_params["duration"]
    half_time = cur_time + cmd_params["duration"] // 2

    print(
        "Running whitebox-crash-test with \n"
        + "total-duration="
        + str(cmd_params["duration"])
        + "\n"
    )

    total_check_mode = 4
    check_mode = 0
    kill_random_test = cmd_params["random_kill_odd"]
    kill_mode = 0
    prev_compaction_style = -1
    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params["ops_per_thread"],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers fewer kill points but
            # increases the chance of triggering them. Mode 2 covers even less
            # frequent kill points and further increases the triggering chance.
            if kill_mode == 0:
                additional_opts.update(
                    {
                        "kill_random_test": kill_random_test,
                    }
                )
            elif kill_mode == 1:
                if cmd_params.get("disable_wal", 0) == 1:
                    my_kill_odd = kill_random_test // 50 + 1
                else:
                    my_kill_odd = kill_random_test // 10 + 1
                additional_opts.update(
                    {
                        "kill_random_test": my_kill_odd,
                        "kill_exclude_prefixes": "WritableFileWriter::Append,"
                        + "WritableFileWriter::WriteBuffered",
                    }
                )
            elif kill_mode == 2:
                # TODO: May need to adjust random odds if kill_random_test
                # is too small.
                additional_opts.update(
                    {
                        "kill_random_test": (kill_random_test // 5000 + 1),
                        "kill_exclude_prefixes": "WritableFileWriter::Append,"
                        "WritableFileWriter::WriteBuffered,"
                        "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                    }
                )
            # Run kill mode 0, 1 and 2 by turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params["ops_per_thread"],
                "compaction_style": 1,
            }
            # Single level universal has a lot of special logic. Ensure we cover
            # it sometimes.
            if random.randint(0, 1) == 1:
                additional_opts.update(
                    {
                        "num_levels": 1,
                    }
                )
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params["ops_per_thread"] // 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params["ops_per_thread"],
            }

        cur_compaction_style = additional_opts.get(
            "compaction_style", cmd_params.get("compaction_style", 0)
        )
        if prev_compaction_style != -1 and prev_compaction_style != cur_compaction_style:
            print(
                "`compaction_style` is changed in current run so `destroy_db_initially` "
                "is set to 1 as a short-term solution to avoid cycling through previous "
                "db of different compaction style.\n"
            )
            additional_opts["destroy_db_initially"] = 1
        prev_compaction_style = cur_compaction_style

        cmd = gen_cmd(
            dict(
                list(cmd_params.items())
                + list(additional_opts.items())
                + list({"db": dbname}.items())
            ),
            unknown_args,
        )

        print(
            "Running:" + " ".join(cmd) + "\n"
        )  # noqa: E999 T25377293 Grandfathered in

        # If the running time is 15 minutes over the run time, explicitly kill
        # and exit even if the whitebox kill didn't hit. This is to guarantee the
        # run time limit, as if it runs as a job, running too long will create
        # problems for job scheduling or execution.
        # TODO detect a hanging condition. The job might run too long as RocksDB
        # hits a hanging bug.
        hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
            cmd, exit_time - time.time() + 900
        )
        msg = "check_mode={0}, kill option={1}, exitcode={2}\n".format(
            check_mode, additional_opts["kill_random_test"], retncode
        )

        print(msg)
        print(stdoutdata)
        print(stderrdata)

        if hit_timeout:
            print("Killing the run for running too long")
            break

        expected = False
        if additional_opts["kill_random_test"] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts["kill_random_test"] is not None and retncode <= 0:
            # When kill option is given, the test MIGHT kill itself.
            # If it does, negative retncode is expected. Otherwise 0.
            expected = True

        if not expected:
            print("TEST FAILED. See kill option and exit code above!!!\n")
            sys.exit(1)

        stderrdata = stderrdata.lower()
        errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times")
        print("#times error occurred in output is " + str(errorcount) + "\n")

        if errorcount > 0:
            print("TEST FAILED. Output has 'error'!!!\n")
            sys.exit(2)
        if stderrdata.find("fail") >= 0:
            print("TEST FAILED. Output has 'fail'!!!\n")
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            if cleanup_cmd is not None:
                print("Running DB cleanup command - %s\n" % cleanup_cmd)
                ret = os.system(cleanup_cmd)
                if ret != 0:
                    print("TEST FAILED. DB cleanup returned error %d\n" % ret)
                    sys.exit(1)
            os.mkdir(dbname)
            if expected_values_dir is not None:
                shutil.rmtree(expected_values_dir, True)
                os.mkdir(expected_values_dir)

            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill
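
# Note on the modes above: check_mode 0 exercises the random kill points
# (rotating kill_mode 0-2), while check modes 1-3 are full runs without kill,
# under universal, FIFO, and the default compaction style respectively. Since
# check_mode only advances once time.time() > half_time, the first half of the
# duration stays in kill testing, as the comment above describes.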


def main():
    global stress_cmd
    global cleanup_cmd

    parser = argparse.ArgumentParser(
        description="This script runs and kills db_stress multiple times"
    )
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")
    parser.add_argument("--cf_consistency", action="store_true")
    parser.add_argument("--txn", action="store_true")
    parser.add_argument("--test_best_efforts_recovery", action="store_true")
    parser.add_argument("--enable_ts", action="store_true")
    parser.add_argument("--test_multiops_txn", action="store_true")
    parser.add_argument("--write_policy", choices=["write_committed", "write_prepared"])
    parser.add_argument("--stress_cmd")
    parser.add_argument("--test_tiered_storage", action="store_true")
    parser.add_argument("--cleanup_cmd")

    all_params = dict(
        list(default_params.items())
        + list(blackbox_default_params.items())
        + list(whitebox_default_params.items())
        + list(simple_default_params.items())
        + list(blackbox_simple_default_params.items())
        + list(whitebox_simple_default_params.items())
        + list(blob_params.items())
        + list(ts_params.items())
        + list(multiops_txn_default_params.items())
        + list(multiops_wc_txn_params.items())
        + list(multiops_wp_txn_params.items())
        + list(best_efforts_recovery_params.items())
        + list(cf_consistency_params.items())
        + list(tiered_params.items())
        + list(txn_params.items())
    )

    for k, v in all_params.items():
        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
    # unknown_args are passed directly to db_stress
    args, unknown_args = parser.parse_known_args()

    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is not None and not os.path.isdir(test_tmpdir):
        print(
            "%s env var is set to a non-existent directory: %s"
            % (_TEST_DIR_ENV_VAR, test_tmpdir)
        )
        sys.exit(1)

    if args.stress_cmd:
        stress_cmd = args.stress_cmd
    if args.cleanup_cmd:
        cleanup_cmd = args.cleanup_cmd
    if args.test_type == "blackbox":
        blackbox_crash_main(args, unknown_args)
    if args.test_type == "whitebox":
        whitebox_crash_main(args, unknown_args)
    # Only delete the `expected_values_dir` if test passes
    if expected_values_dir is not None:
        shutil.rmtree(expected_values_dir)
    if multiops_txn_key_spaces_file is not None:
        os.remove(multiops_txn_key_spaces_file)


if __name__ == "__main__":
    main()