]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/tools/db_crashtest.py
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / tools / db_crashtest.py
CommitLineData
7c673cae
FG
1#! /usr/bin/env python
2import os
7c673cae
FG
3import sys
4import time
5import random
7c673cae
FG
6import tempfile
7import subprocess
8import shutil
9import argparse
10
# params override priority (a source to the right of "<" wins):
# for default:
#   default_params < {blackbox,whitebox}_default_params < args
# for simple:
#   default_params < {blackbox,whitebox}_default_params <
#   simple_default_params <
#   {blackbox,whitebox}_simple_default_params < args
# for enable_atomic_flush:
#   default_params < {blackbox,whitebox}_default_params <
#   atomic_flush_params < args

# Temporary file whose path is handed to db_stress as --expected_values_path.
# The handle is kept at module level; NamedTemporaryFile removes the file
# once the handle is closed.
expected_values_file = tempfile.NamedTemporaryFile()


def _coin_flip():
    """Return 0 or 1 at random; used for on/off db_stress options."""
    return random.randint(0, 1)


# Baseline db_stress options shared by every test mode. A value may be a
# callable taking no arguments; finalize_and_sanitize() calls it to pick a
# concrete value each time a command line is generated.
default_params = {
    # --- snapshots, block cache and compression ---
    "acquire_snapshot_one_in": 10000,
    "block_size": 16384,
    "cache_size": 1048576,
    "checkpoint_one_in": 1000000,
    "compression_type": "snappy",
    "compression_max_dict_bytes": lambda: 16384 * _coin_flip(),
    "compression_zstd_max_train_bytes": lambda: 65536 * _coin_flip(),
    # --- column family / compaction triggers ---
    "clear_column_family_one_in": 0,
    "compact_files_one_in": 1000000,
    "compact_range_one_in": 1000000,
    # --- operation mix and DB lifecycle ---
    "delpercent": 4,
    "delrangepercent": 1,
    "destroy_db_initially": 0,
    "enable_pipelined_write": _coin_flip,
    "expected_values_path": expected_values_file.name,
    "flush_one_in": 1000000,
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10485760,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "mmap_read": _coin_flip,
    "nooverwritepercent": 1,
    "open_files": 500000,
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
    "recycle_log_file_num": _coin_flip,
    "reopen": 20,
    "snapshot_hold_ops": 100000,
    "subcompactions": lambda: random.randint(1, 4),
    "target_file_size_base": 2097152,
    "target_file_size_multiplier": 2,
    # --- I/O mode and merge operator toggles ---
    "use_direct_reads": _coin_flip,
    "use_direct_io_for_flush_and_compaction": _coin_flip,
    "use_full_merge_v1": _coin_flip,
    "use_merge": _coin_flip,
    "verify_checksum": 1,
    "write_buffer_size": 4 * 1024 * 1024,
    "writepercent": 35,
    # --- table format ---
    "format_version": lambda: random.randint(2, 4),
    "index_block_restart_interval": lambda: random.choice(range(1, 16)),
}
67
11fdf7f2
TL
# Name of the environment variable that, when set to a non-empty value,
# selects the parent directory under which test databases are created.
_TEST_DIR_ENV_VAR = 'TEST_TMPDIR'


def get_dbname(test_name):
    """Create an empty database directory for `test_name` and return its path.

    When the TEST_TMPDIR environment variable is set and non-empty, the
    directory is "<TEST_TMPDIR>/rocksdb_crashtest_<test_name>"; any existing
    directory at that path is removed first and a fresh one is created.
    Otherwise a new unique directory is made with tempfile.mkdtemp, using
    "rocksdb_crashtest_<test_name>" as the name prefix.

    Args:
        test_name: short label appended to the directory name.

    Returns:
        Path of the newly created directory.
    """
    parent_dir = os.environ.get(_TEST_DIR_ENV_VAR)
    if parent_dir is not None and parent_dir != "":
        db_dir = "{0}/rocksdb_crashtest_{1}".format(parent_dir, test_name)
        # Drop leftovers from an earlier run (errors ignored), then recreate.
        shutil.rmtree(db_dir, True)
        os.mkdir(db_dir)
        return db_dir
    return tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)
80
11fdf7f2
TL
81
def is_direct_io_supported(dbname):
    """Report whether a file inside `dbname` can be opened with O_DIRECT.

    A temporary file is created in `dbname` (and removed again on return) and
    re-opened with the os.O_DIRECT flag as a probe.

    Args:
        dbname: path of an existing directory to probe.

    Returns:
        True if the O_DIRECT open succeeded, False if it was rejected or the
        platform's os module does not define O_DIRECT.

    Raises:
        OSError: if the temporary file cannot be created in `dbname`.
    """
    with tempfile.NamedTemporaryFile(dir=dbname) as f:
        try:
            fd = os.open(f.name, os.O_DIRECT)
        except (OSError, AttributeError):
            # OSError: the open with O_DIRECT was refused.
            # AttributeError: os.O_DIRECT does not exist on this platform.
            return False
        # The probe only cares whether the open worked; release the
        # descriptor so repeated probes do not leak file descriptors.
        os.close(fd)
        return True
89
90
7c673cae
FG
# Overrides layered on top of default_params for blackbox runs.
blackbox_default_params = dict(
    # total time for this script to test db_stress
    duration=6000,
    # time for one db_stress instance to run
    interval=120,
    # since we will be killing anyway, use large value for ops_per_thread
    ops_per_thread=100000000,
    set_options_one_in=10000,
    test_batches_snapshots=1,
)

# Overrides layered on top of default_params for whitebox runs.
whitebox_default_params = dict(
    duration=10000,
    log2_keys_per_lock=10,
    ops_per_thread=200000,
    random_kill_odd=888887,
    test_batches_snapshots=lambda: random.randint(0, 1),
)

# Overrides applied when --simple is given (either test type).
simple_default_params = dict(
    allow_concurrent_memtable_write=lambda: random.randint(0, 1),
    column_families=1,
    max_background_compactions=1,
    max_bytes_for_level_base=67108864,
    memtablerep="skip_list",
    prefixpercent=25,
    readpercent=25,
    target_file_size_base=16777216,
    target_file_size_multiplier=1,
    test_batches_snapshots=0,
    write_buffer_size=32 * 1024 * 1024,
)

# Extra overrides for --simple combined with the blackbox test type.
blackbox_simple_default_params = dict(
    open_files=-1,
    set_options_one_in=0,
)

# Extra overrides for --simple combined with the whitebox test type (none).
whitebox_simple_default_params = dict()

# Overrides applied when --enable_atomic_flush is given.
atomic_flush_params = dict(
    disable_wal=1,
    reopen=0,
    test_atomic_flush=1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    write_buffer_size=1024 * 1024,
)
139
7c673cae
FG
140
def finalize_and_sanitize(src_params):
    """Resolve callable values and remove conflicting option combinations.

    A new dict is built from `src_params`; every callable value is replaced
    by the result of calling it. The following adjustments are then applied
    to the new dict (the input dict is not modified):

    * compression_zstd_max_train_bytes is forced to 0 unless compression_type
      is "zstd" and compression_max_dict_bytes is non-zero.
    * memtablerep is forced to "skip_list" when
      allow_concurrent_memtable_write is 1 or absent.
    * both direct-I/O options are forced to 0 when mmap_read is 1 or the
      directory in "db" fails the is_direct_io_supported probe.
    * when test_batches_snapshots is 1, delrangepercent is folded into
      delpercent and then zeroed.

    Args:
        src_params: dict of option name -> value or zero-argument callable.
            Must contain "mmap_read", and "db" whenever mmap_read is not 1.

    Returns:
        A new dict of finalized option values.
    """
    dest_params = {}
    for name, value in src_params.items():
        dest_params[name] = value() if callable(value) else value

    uses_zstd = dest_params.get("compression_type") == "zstd"
    has_dict_budget = dest_params.get("compression_max_dict_bytes") != 0
    if not (uses_zstd and has_dict_budget):
        dest_params["compression_zstd_max_train_bytes"] = 0

    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
        dest_params["memtablerep"] = "skip_list"

    # The probe is only run when mmap_read has not already ruled direct I/O
    # out, so "db" is not touched when mmap_read is 1.
    direct_io_ok = (dest_params["mmap_read"] != 1
                    and is_direct_io_supported(dest_params["db"]))
    if not direct_io_ok:
        dest_params.update({
            "use_direct_io_for_flush_and_compaction": 0,
            "use_direct_reads": 0,
        })

    if dest_params.get("test_batches_snapshots") == 1:
        dest_params["delpercent"] += dest_params["delrangepercent"]
        dest_params["delrangepercent"] = 0

    return dest_params
157
158
def gen_cmd_params(args):
    """Build the effective parameter dict for the selected test mode.

    Parameter sets are layered in this order, later layers overriding
    earlier ones: default_params, the blackbox/whitebox defaults for
    args.test_type, the simple-mode sets (when args.simple is true), the
    atomic-flush set (when args.enable_atomic_flush is true), and finally
    every attribute of `args` whose value is not None.

    Args:
        args: parsed argparse namespace with test_type, simple and
            enable_atomic_flush attributes.

    Returns:
        A new dict of parameter name -> value (values may still be
        callables).
    """
    is_blackbox = args.test_type == 'blackbox'
    is_whitebox = args.test_type == 'whitebox'

    layers = [default_params]
    if is_blackbox:
        layers.append(blackbox_default_params)
    if is_whitebox:
        layers.append(whitebox_default_params)
    if args.simple:
        layers.append(simple_default_params)
        if is_blackbox:
            layers.append(blackbox_simple_default_params)
        if is_whitebox:
            layers.append(whitebox_simple_default_params)
    if args.enable_atomic_flush:
        layers.append(atomic_flush_params)
    # Explicit command-line values win over every default.
    layers.append({name: value for name, value in vars(args).items()
                   if value is not None})

    params = {}
    for layer in layers:
        params.update(layer)
    return params
180
181
11fdf7f2
TL
def gen_cmd(params, unknown_params):
    """Build the db_stress command line for one run.

    `params` is passed through finalize_and_sanitize; each resulting
    key/value pair becomes a "--key=value" argument, except for keys that
    only this script understands and pairs whose value is None.

    Args:
        params: dict of parameter name -> value or zero-argument callable.
        unknown_params: list of extra arguments appended verbatim.

    Returns:
        The command as a list of strings, starting with './db_stress'.
    """
    # Options consumed by this script itself rather than by db_stress.
    # Built once here instead of once per key inside the loop.
    script_only_keys = frozenset(['test_type', 'simple', 'duration',
                                  'interval', 'random_kill_odd',
                                  'enable_atomic_flush'])
    cmd = ['./db_stress']
    for k, v in finalize_and_sanitize(params).items():
        if k in script_only_keys or v is None:
            continue
        cmd.append('--{0}={1}'.format(k, v))
    return cmd + unknown_params
190
191
# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args, unknown_args):
    """Run db_stress in a loop, killing each instance after a fixed interval.

    Until cmd_params['duration'] seconds have passed, a db_stress process is
    started on the same database directory, polled once per second, and
    killed once cmd_params['interval'] seconds have elapsed (unless it exited
    by itself first). Its stderr is then scanned: any non-blank line that
    does not start with 'WARNING' is reported and the script exits with
    status 2. The database directory is removed only when the loop finishes
    without such an error.

    Args:
        args: parsed argparse namespace, forwarded to gen_cmd_params.
        unknown_args: list of extra arguments passed through to db_stress.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('blackbox')
    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    while time.time() < exit_time:
        run_had_errors = False
        killtime = time.time() + cmd_params['interval']

        # dict(mapping, key=value) merges on both Python 2 and Python 3;
        # adding two .items() results only works on Python 2.
        cmd = gen_cmd(dict(cmd_params, db=dbname), unknown_args)

        # universal_newlines=True makes stderr yield text on Python 3 too,
        # so the string comparisons below behave the same on both versions.
        child = subprocess.Popen(cmd, stderr=subprocess.PIPE,
                                 universal_newlines=True)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, ' '.join(cmd)))

        # Wait until either the interval elapses or db_stress exits.
        while time.time() < killtime and child.poll() is None:
            time.sleep(1)

        if child.poll() is not None:
            print("WARNING: db_stress ended before kill: exitcode=%d\n"
                  % child.returncode)
        else:
            child.kill()
            print("KILLED %d\n" % child.pid)
            time.sleep(1)  # time to stabilize after a kill

        # readline() returns '' only at end of stream. Blank lines inside
        # the output are skipped instead of being mistaken for the end, so
        # error messages that follow an empty line are still detected.
        for raw_line in iter(child.stderr.readline, ''):
            line = raw_line.strip()
            if not line or line.startswith('WARNING'):
                continue
            run_had_errors = True
            print('stderr has error message:')
            print('***' + line + '***')

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)
249
250
# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args, unknown_args):
    """Run db_stress repeatedly, alternating kill runs and normal runs.

    check_mode 0 runs db_stress with --kill_random_test, rotating through
    three kill modes, and expects a negative exit code. check_mode 1, 2 and
    3 run without a kill option (universal compaction, FIFO compaction and
    the default style) and expect exit code 0. check_mode starts advancing
    only after half of cmd_params['duration'] has elapsed; from then on the
    database directory is also wiped and recreated after every run and
    'expected_values_path' is dropped from the parameters.

    The script exits with status 1 when the exit code is not the expected
    kind, and with status 2 when the lower-cased db_stress output contains
    'error' (other than 'got errors 0 times') or 'fail'.

    Args:
        args: parsed argparse namespace, forwarded to gen_cmd_params.
        unknown_args: list of extra arguments passed through to db_stress.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('whitebox')

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    # '//' keeps the Python 2 integer-division result on Python 3 as well.
    half_time = cur_time + cmd_params['duration'] // 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    total_check_mode = 4
    check_mode = 0
    kill_random_test = cmd_params['random_kill_odd']
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers less kill points
            # but increases chance of triggering them. Mode 2 covers even
            # less frequent kill points and further increases triggering
            # chance.
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                additional_opts.update({
                    # Integer division: the value is rendered into a
                    # db_stress flag and must stay a whole number.
                    "kill_random_test": (kill_random_test // 10 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })
            elif kill_mode == 2:
                # TODO: May need to adjust random odds if kill_random_test
                # is too small.
                additional_opts.update({
                    "kill_random_test": (kill_random_test // 5000 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    "WritableFileWriter::WriteBuffered,"
                    "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                })
            # Run kill mode 0, 1 and 2 by turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] // 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        # Layer the per-run options and the db path over the base params.
        # (Adding .items() results together only works on Python 2.)
        run_params = dict(cmd_params)
        run_params.update(additional_opts)
        run_params['db'] = dbname
        cmd = gen_cmd(run_params, unknown_args)

        print("Running:" + ' '.join(cmd) + "\n")

        # stderr is merged into stdout; universal_newlines=True yields text
        # on Python 3 so the substring checks below work on both versions.
        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 universal_newlines=True)
        stdoutdata, _ = popen.communicate()
        retncode = popen.returncode
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
               check_mode, additional_opts['kill_random_test'], retncode))
        print(msg)
        print(stdoutdata)

        expected = False
        if additional_opts['kill_random_test'] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts['kill_random_test'] is not None and retncode < 0:
            # we expect negative retncode if kill option was given
            expected = True

        if not expected:
            print("TEST FAILED. See kill option and exit code above!!!\n")
            sys.exit(1)

        stdoutdata = stdoutdata.lower()
        # 'got errors 0 times' contains the word 'error' but is a success
        # report, so its occurrences are subtracted from the count.
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print("#times error occurred in output is " + str(errorcount) + "\n")

        if (errorcount > 0):
            print("TEST FAILED. Output has 'error'!!!\n")
            sys.exit(2)
        if (stdoutdata.find('fail') >= 0):
            print("TEST FAILED. Output has 'fail'!!!\n")
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            os.mkdir(dbname)
            cmd_params.pop('expected_values_path', None)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill
372
373
def main():
    """Parse the command line and run the selected crash test.

    Every key of the default parameter dicts becomes an optional
    "--<key>" argument whose type is taken from the default value (callable
    defaults are called once to obtain a sample value). Arguments the parser
    does not recognise are collected and passed straight to db_stress.

    Exits with status 1 if the TEST_TMPDIR environment variable is set to a
    non-empty value that is not an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="This script runs and kills db_stress multiple times")
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")
    parser.add_argument("--enable_atomic_flush", action='store_true')

    # Merge with update() rather than adding .items() results together,
    # which only works on Python 2. Later sets override earlier ones.
    all_params = {}
    for param_set in (default_params,
                      blackbox_default_params,
                      whitebox_default_params,
                      simple_default_params,
                      blackbox_simple_default_params,
                      whitebox_simple_default_params):
        all_params.update(param_set)

    for k, v in all_params.items():
        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
    # unknown_args are passed directly to db_stress
    args, unknown_args = parser.parse_known_args()

    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    # An empty value is treated as "not set", matching get_dbname(), which
    # falls back to a fresh temporary directory in that case.
    if test_tmpdir and not os.path.isdir(test_tmpdir):
        print('%s env var is set to a non-existent directory: %s' %
              (_TEST_DIR_ENV_VAR, test_tmpdir))
        sys.exit(1)

    if args.test_type == 'blackbox':
        blackbox_crash_main(args, unknown_args)
    elif args.test_type == 'whitebox':
        whitebox_crash_main(args, unknown_args)


if __name__ == '__main__':
    main()